Hi Chris,<br>Did this (and other patches that you wrote) get merged yet? Did you create an account? I'm interested in your work and would like to test it.<br><br><div class="gmail_quote">On Wed, May 6, 2009 at 10:28 AM, Christoph Bumiller <span dir="ltr"><<a href="mailto:e0425955@student.tuwien.ac.at" target="_blank">e0425955@student.tuwien.ac.at</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin: 0pt 0pt 0pt 0.8ex; border-left: 1px solid rgb(204, 204, 204); padding-left: 1ex;">Hi ! I've been trying to improve NV50 shader generation a bit the last couple of weeks, so here is<br>
what I've produced. I don't know if it's usable for you or just a pile of horrible hacks, but at<br>
least it makes some mesa demos render more correctly, e.g. the teapot (aside from mip-mapping issues<br>
of the floor texture), arbfplight, and I think the gears also didn't appear as they should before,<br>
and I hope it doesn't break others that worked.<br>
I also tried playing neverball and neverputt, which at some point worked fine, but now it locks up<br>
the GPU again after a certain (short) amount of time. That's probably not related to my<br>
modifications, because it crashes without the patches as well (plus has some flickering and other<br>
graphics errors). It seems to work OK if I run it with valgrind, though.<br>
There also are and have been some random graphics errors that spam the kernel log with invalid<br>
method NV50TCL_VERTEX_END, so if something doesn't look right, try to restart the program, or toggle<br>
some options in the mesa demos (show help, etc.).<br>
<br>
There might, as always, be some bugs in the patches, of course, and they probably can't be<br>
committed unmodified. I've not put them in the email text but as attachments because there are rather<br>
many changes. There's a short description (commit log) in each patch, but I hope the code speaks for<br>
itself, otherwise I'll provide more explanation / add more comments ... later.<br>
These don't represent everything I've tried to improve, but the rest isn't in any usable shape yet.<br>
<br>
If anyone who knows their way around the gallium code has time, please have a look and tell me what<br>
you think. Thank you.<br>
<font color="#888888"><br>
Christoph<br>
</font><br>commit 7ab9fc73707be46375668e557b5a5c1a373096ad<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Sun May 3 21:03:35 2009 +0200<br>
<br>
Remove some memory leaks: free allocated temp in all opcode cases<br>
of tx_insn; free nv50_regs for immds in LIT and those allocated in<br>
tgsi_src.<br>
Make LRP use 2 instructions (SUB,MAD) instead of 3 (NEG,MAD,MAD).<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 2d15868..1a94327 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -28,6 +28,7 @@<br>
#include "pipe/p_shader_tokens.h"<br>
#include "tgsi/tgsi_parse.h"<br>
#include "tgsi/tgsi_util.h"<br>
+#include "tgsi/tgsi_dump.h"<br>
<br>
#include "nv50_context.h"<br>
<br>
@@ -795,12 +796,6 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);<br>
struct nv50_reg *tmp[4];<br>
<br>
- if (mask & (1 << 0))<br>
- emit_mov(pc, dst[0], one);<br>
-<br>
- if (mask & (1 << 3))<br>
- emit_mov(pc, dst[3], one);<br>
-<br>
if (mask & (3 << 1)) {<br>
if (mask & (1 << 1))<br>
tmp[0] = dst[1];<br>
@@ -823,6 +818,18 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
emit_mov(pc, dst[2], zero);<br>
set_pred(pc, 3, 0, pc->p->exec_tail);<br>
}<br>
+<br>
+ /* do this last, in case src[i,j] == dst[0,3] */<br>
+ if (mask & (1 << 0))<br>
+ emit_mov(pc, dst[0], one);<br>
+<br>
+ if (mask & (1 << 3))<br>
+ emit_mov(pc, dst[3], one);<br>
+<br>
+ FREE(pos128);<br>
+ FREE(neg128);<br>
+ FREE(zero);<br>
+ FREE(one);<br>
}<br>
<br>
static void<br>
@@ -885,8 +892,9 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
{<br>
struct nv50_reg *r = NULL;<br>
struct nv50_reg *temp;<br>
- unsigned c;<br>
+ unsigned sgn, c;<br>
<br>
+ sgn = tgsi_util_get_full_src_register_sign_mode(src, chan);<br>
c = tgsi_util_get_full_src_register_extswizzle(src, chan);<br>
switch (c) {<br>
case TGSI_EXTSWIZZLE_X:<br>
@@ -915,16 +923,18 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
break;<br>
case TGSI_EXTSWIZZLE_ZERO:<br>
r = alloc_immd(pc, 0.0);<br>
- break;<br>
+ return r;<br>
case TGSI_EXTSWIZZLE_ONE:<br>
- r = alloc_immd(pc, 1.0);<br>
- break;<br>
+ if (sgn == TGSI_UTIL_SIGN_TOGGLE || sgn == TGSI_UTIL_SIGN_SET)<br>
+ return alloc_immd(pc, -1.0);<br>
+ else<br>
+ return alloc_immd(pc, 1.0);<br>
default:<br>
assert(0);<br>
break;<br>
}<br>
<br>
- switch (tgsi_util_get_full_src_register_sign_mode(src, chan)) {<br>
+ switch (sgn) {<br>
case TGSI_UTIL_SIGN_KEEP:<br>
break;<br>
case TGSI_UTIL_SIGN_CLEAR:<br>
@@ -955,7 +965,7 @@ static boolean<br>
nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
{<br>
const struct tgsi_full_instruction *inst = &tok->FullInstruction;<br>
- struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp;<br>
+ struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp = NULL;<br>
unsigned mask, sat, unit;<br>
int i, c;<br>
<br>
@@ -1021,7 +1031,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
emit_mov(pc, dst[c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_DP4:<br>
temp = alloc_temp(pc, NULL);<br>
@@ -1034,7 +1043,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
emit_mov(pc, dst[c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_DPH:<br>
temp = alloc_temp(pc, NULL);<br>
@@ -1047,7 +1055,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
emit_mov(pc, dst[c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_DST:<br>
{<br>
@@ -1072,7 +1079,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
emit_mov(pc, dst[c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_FLR:<br>
for (c = 0; c < 4; c++) {<br>
@@ -1089,7 +1095,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
emit_flr(pc, temp, src[0][c]);<br>
emit_sub(pc, dst[c], src[0][c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_KIL:<br>
emit_kil(pc, src[0][0]);<br>
@@ -1110,15 +1115,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
}<br>
break;<br>
case TGSI_OPCODE_LRP:<br>
+ temp = alloc_temp(pc, NULL);<br>
for (c = 0; c < 4; c++) {<br>
if (!(mask & (1 << c)))<br>
continue;<br>
- /*XXX: we can do better than this */<br>
- temp = alloc_temp(pc, NULL);<br>
- emit_neg(pc, temp, src[0][c]);<br>
- emit_mad(pc, temp, temp, src[2][c], src[2][c]);<br>
- emit_mad(pc, dst[c], src[0][c], src[1][c], temp);<br>
- free_temp(pc, temp);<br>
+ emit_sub(pc, temp, src[1][c], src[2][c]);<br>
+ emit_mad(pc, dst[c], temp, src[0][c], src[2][c]);<br>
}<br>
break;<br>
case TGSI_OPCODE_MAD:<br>
@@ -1164,7 +1166,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
emit_mov(pc, dst[c], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_RCP:<br>
for (c = 0; c < 4; c++) {<br>
@@ -1259,7 +1260,6 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
emit_mul(pc, temp, src[0][1], src[1][0]);<br>
emit_msb(pc, dst[2], src[0][0], src[1][1], temp);<br>
}<br>
- free_temp(pc, temp);<br>
break;<br>
case TGSI_OPCODE_END:<br>
break;<br>
@@ -1268,6 +1268,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
return FALSE;<br>
}<br>
<br>
+ if (temp)<br>
+ free_temp(pc, temp);<br>
+<br>
if (sat) {<br>
for (c = 0; c < 4; c++) {<br>
struct nv50_program_exec *e;<br>
@@ -1288,6 +1291,19 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
}<br>
}<br>
<br>
+ for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {<br>
+ for (c = 0; c < 4; c++) {<br>
+ if (!src[i][c])<br>
+ continue;<br>
+ if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)<br>
+ FREE(src[i][c]);<br>
+<br>
+ /* Might also release temporaries not used anymore in this loop,<br>
+ * therefore no temp_immd and temp_immd_nr like for temp_temp.<br>
+ */<br>
+ }<br>
+ }<br>
+<br>
kill_temp_temp(pc);<br>
return TRUE;<br>
}<br>
<br>commit 93d8cfb3e13179d6ed28c4989cefc92389008f0b<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Tue May 5 20:54:43 2009 +0200<br>
<br>
- extend nv50_pc to track insn nr, add allow half insn boolean<br>
- extend nv50_reg to record insn of last use and FP output hw index<br>
- add some functions for later use<br>
- modify alloc_reg to prefer final FP output hw if set<br>
- record interpolation mode in tx_prep<br>
- count number of insns in tx_prep<br>
- record depth output, and position and color input indices<br>
- inspect instructions for register usage<br>
- set pc->allow32 to FALSE on first and last insn<br>
<br>
shouldn't change generated shader code yet<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 1a94327..cb92a31 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -86,6 +86,9 @@ struct nv50_reg {<br>
<br>
int hw;<br>
int neg;<br>
+<br>
+ int rhw; /* result hw for FP outputs */<br>
+ int acc; /* instruction where this reg is last read (first insn == 1) */<br>
};<br>
<br>
struct nv50_pc {<br>
@@ -109,6 +112,12 @@ struct nv50_pc {<br>
<br>
struct nv50_reg *temp_temp[16];<br>
unsigned temp_temp_nr;<br>
+<br>
+ /* current instruction and total number of insns */<br>
+ unsigned insn_cur;<br>
+ unsigned insn_nr;<br>
+<br>
+ boolean allow32; /* TRUE when half insns are allowed */<br>
};<br>
<br>
static void<br>
@@ -132,7 +141,24 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)<br>
return;<br>
}<br>
<br>
- for (i = 0; i < NV50_SU_MAX_TEMP; i++) {<br>
+ i = 0;<br>
+ if (reg->rhw != -1) {<br>
+ /* try to allocate temporary with index rhw first */<br>
+ if (!(pc->r_temp[reg->rhw])) {<br>
+ pc->r_temp[reg->rhw] = reg;<br>
+ reg->hw = reg->rhw;<br>
+ if (pc->p->cfg.high_temp < (reg->rhw + 1))<br>
+ pc->p->cfg.high_temp = reg->rhw + 1;<br>
+ return;<br>
+ }<br>
+ /* If we can't allocate the final destination index of the output,<br>
+ * put it in a high temporary so we need not shuffle around later.<br>
+ * (like, $r0 needs to go in $r1 and $r1 in $r0 etc.)<br>
+ */<br>
+ i = pc->result_nr * 4;<br>
+ }<br>
+<br>
+ for (; i < NV50_SU_MAX_TEMP; i++) {<br>
if (!(pc->r_temp[i])) {<br>
pc->r_temp[i] = reg;<br>
reg->hw = i;<br>
@@ -160,6 +186,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)<br>
r->type = P_TEMP;<br>
r->index = -1;<br>
r->hw = i;<br>
+ r->rhw = -1;<br>
pc->r_temp[i] = r;<br>
return r;<br>
}<br>
@@ -169,6 +196,56 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)<br>
return NULL;<br>
}<br>
<br>
+static struct nv50_reg *<br>
+alloc_preferred_temp(struct nv50_pc *pc, int hw)<br>
+{<br>
+ struct nv50_reg *r;<br>
+<br>
+ if (hw >= NV50_SU_MAX_TEMP || hw == -1 || pc->r_temp[hw])<br>
+ return alloc_temp(pc, NULL);<br>
+<br>
+ r = CALLOC_STRUCT(nv50_reg);<br>
+ r->type = P_TEMP;<br>
+ r->index = -1;<br>
+ r->hw = hw;<br>
+ r->rhw = -1;<br>
+ pc->r_temp[hw] = r;<br>
+<br>
+ return r;<br>
+}<br>
+<br>
+/* Assign the hw of the discarded temporary register src<br>
+ * to the tgsi register dst and free src.<br>
+ */<br>
+static void<br>
+assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
+{<br>
+ assert(dst->index != -1 && src->index == -1 && src->hw != -1);<br>
+<br>
+ if (dst->hw != -1)<br>
+ pc->r_temp[dst->hw] = NULL;<br>
+ pc->r_temp[src->hw] = dst;<br>
+ dst->hw = src->hw;<br>
+<br>
+ FREE(src);<br>
+}<br>
+<br>
+/* release the hardware resource held by r */<br>
+static void<br>
+release_hw(struct nv50_pc *pc, struct nv50_reg *r)<br>
+{<br>
+ assert(r->type == P_TEMP);<br>
+ if (r->hw == -1)<br>
+ return;<br>
+<br>
+ assert(pc->r_temp[r->hw] == r);<br>
+ pc->r_temp[r->hw] = NULL;<br>
+<br>
+ r->acc = 0;<br>
+ if (r->index == -1)<br>
+ FREE(r);<br>
+}<br>
+<br>
static void<br>
free_temp(struct nv50_pc *pc, struct nv50_reg *r)<br>
{<br>
@@ -251,7 +328,14 @@ alloc_immd(struct nv50_pc *pc, float f)<br>
struct nv50_reg *r = CALLOC_STRUCT(nv50_reg);<br>
unsigned hw;<br>
<br>
- hw = ctor_immd(pc, f, 0, 0, 0) * 4;<br>
+ /* don't allocate more space if the value is already there */<br>
+ for (hw = 0; hw < pc->immd_nr * 4; ++hw)<br>
+ if (pc->immd_buf[hw] == f)<br>
+ break;<br>
+<br>
+ if (hw == pc->immd_nr * 4)<br>
+ hw = ctor_immd(pc, f, 0, 0, 0) * 4;<br>
+<br>
r->type = P_IMMD;<br>
r->hw = hw;<br>
r->index = -1;<br>
@@ -355,6 +439,12 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
e->inst[1] |= (val >> 6) << 2;<br>
}<br>
<br>
+<br>
+#define INTERP_LINEAR 0<br>
+#define INTERP_FLAT 1<br>
+#define INTERP_PERSPECTIVE 2<br>
+#define INTERP_CENTROID 4<br>
+<br>
static void<br>
emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,<br>
struct nv50_reg *src, struct nv50_reg *iv)<br>
@@ -535,6 +625,14 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
e->inst[1] |= (src->hw << 14);<br>
}<br>
<br>
+static boolean<br>
+requires_long(struct nv50_program_exec *e, struct nv50_reg *src)<br>
+{<br>
+ if (is_long(e) || src->type == P_IMMD || src->type == P_CONST)<br>
+ return TRUE;<br>
+ return FALSE;<br>
+}<br>
+<br>
static void<br>
emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0,<br>
struct nv50_reg *src1)<br>
@@ -870,6 +968,62 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src)<br>
emit(pc, e);<br>
}<br>
<br>
+static void<br>
+emit_nop(struct nv50_pc *pc, boolean l)<br>
+{<br>
+ struct nv50_program_exec *e = exec(pc);<br>
+<br>
+ e->inst[0] = 0xF0000000;<br>
+ if (l) {<br>
+ set_long(pc, e);<br>
+ e->inst[1] = 0xE0000000;<br>
+ }<br>
+<br>
+ emit(pc, e);<br>
+}<br>
+<br>
+/* Adjust a bitmask that indicates what components of a source are used,<br>
+ * we use this in tx_prep so we only load interpolants that are needed.<br>
+ */<br>
+static void<br>
+insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)<br>
+{<br>
+ const struct tgsi_instruction_ext_texture *tex;<br>
+<br>
+ switch (insn->Instruction.Opcode) {<br>
+ case TGSI_OPCODE_DP3:<br>
+ *mask = 0x7;<br>
+ break;<br>
+ case TGSI_OPCODE_DP4:<br>
+ case TGSI_OPCODE_DPH:<br>
+ *mask = 0xF;<br>
+ break;<br>
+ case TGSI_OPCODE_LIT:<br>
+ *mask = 0xB;<br>
+ break;<br>
+ case TGSI_OPCODE_RCP:<br>
+ case TGSI_OPCODE_RSQ:<br>
+ *mask = 0x1;<br>
+ break;<br>
+ case TGSI_OPCODE_TXP:<br>
+ *mask = 0x8;<br>
+ /* fall through to TEX */<br>
+ case TGSI_OPCODE_TEX:<br>
+ assert(insn->Instruction.Extended);<br>
+ tex = &insn->InstructionExtTexture;<br>
+<br>
+ if (tex->Texture == TGSI_TEXTURE_1D)<br>
+ *mask |= 0x1;<br>
+ else<br>
+ if (tex->Texture == TGSI_TEXTURE_2D)<br>
+ *mask |= 0x3;<br>
+ else<br>
+ *mask |= 0x7;<br>
+ default:<br>
+ break;<br>
+ }<br>
+}<br>
+<br>
static struct nv50_reg *<br>
tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)<br>
{<br>
@@ -1308,12 +1462,53 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
return TRUE;<br>
}<br>
<br>
+static void<br>
+set_acc_array(unsigned *p, const struct tgsi_full_src_register *src,<br>
+ unsigned mask, unsigned n)<br>
+{<br>
+ unsigned k, c;<br>
+<br>
+ for (c = 0; c < 4; c++) {<br>
+ if (!(mask & (1 << c)))<br>
+ continue;<br>
+<br>
+ k = tgsi_util_get_full_src_register_extswizzle(src, c);<br>
+ switch (k) {<br>
+ case TGSI_EXTSWIZZLE_X:<br>
+ case TGSI_EXTSWIZZLE_Y:<br>
+ case TGSI_EXTSWIZZLE_Z:<br>
+ case TGSI_EXTSWIZZLE_W:<br>
+ p[src->SrcRegister.Index * 4 + k] = n;<br>
+ break;<br>
+ default:<br>
+ break;<br>
+ }<br>
+ }<br>
+}<br>
+<br>
static boolean<br>
nv50_program_tx_prep(struct nv50_pc *pc)<br>
{<br>
struct tgsi_parse_context p;<br>
boolean ret = FALSE;<br>
unsigned i, c;<br>
+ unsigned fcol, bcol, fcrd, depr;<br>
+<br>
+ /* record interpolation mode from declaration */<br>
+ boolean centroid_load = FALSE;<br>
+ boolean perspect_load = FALSE;<br>
+ unsigned interp_mode[32];<br>
+<br>
+ /* track register usage for temps and attrs */<br>
+ unsigned *last_t_use = NULL;<br>
+ unsigned *last_a_use = NULL;<br>
+<br>
+ depr = fcol = bcol = fcrd = 0xFFFFFFFF;<br>
+<br>
+ if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
+ pc->p->cfg.fp.regs[0] = 0x01000404;<br>
+ pc->p->cfg.fp.regs[1] = 0x00000400;<br>
+ }<br>
<br>
tgsi_parse_init(&p, pc->p->pipe.tokens);<br>
while (!tgsi_parse_end_of_tokens(&p)) {<br>
@@ -1326,6 +1521,10 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
const struct tgsi_full_immediate *imm =<br>
&p.FullToken.FullImmediate;<br>
<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+ tgsi_dump_immediate(imm);<br>
+#endif<br>
+<br>
ctor_immd(pc, imm->u.ImmediateFloat32[0].Float,<br>
imm->u.ImmediateFloat32[1].Float,<br>
imm->u.ImmediateFloat32[2].Float,<br>
@@ -1335,11 +1534,16 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
case TGSI_TOKEN_TYPE_DECLARATION:<br>
{<br>
const struct tgsi_full_declaration *d;<br>
- unsigned last;<br>
+ unsigned last, first, mode;<br>
<br>
d = &p.FullToken.FullDeclaration;<br>
+ first = d->DeclarationRange.First;<br>
last = d->DeclarationRange.Last;<br>
<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+ tgsi_dump_declaration(d);<br>
+#endif<br>
+<br>
switch (d->Declaration.File) {<br>
case TGSI_FILE_TEMPORARY:<br>
if (pc->temp_nr < (last + 1))<br>
@@ -1348,10 +1552,71 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
case TGSI_FILE_OUTPUT:<br>
if (pc->result_nr < (last + 1))<br>
pc->result_nr = last + 1;<br>
+<br>
+ if (!d->Declaration.Semantic)<br>
+ break;<br>
+<br>
+ switch (d->Semantic.SemanticName) {<br>
+ case TGSI_SEMANTIC_POSITION:<br>
+ depr = first;<br>
+ pc->p->cfg.fp.regs[2] |= 0x00000100;<br>
+ pc->p->cfg.fp.regs[3] |= 0x00000011;<br>
+ break;<br>
+ default:<br>
+ break;<br>
+ }<br>
break;<br>
case TGSI_FILE_INPUT:<br>
+ {<br>
if (pc->attr_nr < (last + 1))<br>
pc->attr_nr = last + 1;<br>
+<br>
+ if (pc->p->type != PIPE_SHADER_FRAGMENT)<br>
+ break;<br>
+<br>
+ switch (d->Declaration.Interpolate) {<br>
+ case TGSI_INTERPOLATE_CONSTANT:<br>
+ mode = INTERP_FLAT;<br>
+ break;<br>
+ case TGSI_INTERPOLATE_PERSPECTIVE:<br>
+ mode = INTERP_PERSPECTIVE;<br>
+ perspect_load = TRUE;<br>
+ break;<br>
+ default:<br>
+ mode = INTERP_LINEAR;<br>
+ break;<br>
+ }<br>
+<br>
+ if (d->Declaration.Semantic) {<br>
+ switch (d->Semantic.SemanticName) {<br>
+ case TGSI_SEMANTIC_POSITION:<br>
+ fcrd = first;<br>
+ break;<br>
+ case TGSI_SEMANTIC_COLOR:<br>
+ fcol = first;<br>
+ mode = INTERP_PERSPECTIVE;<br>
+ perspect_load = TRUE;<br>
+ break;<br>
+ case TGSI_SEMANTIC_BCOLOR:<br>
+ bcol = first;<br>
+ mode = INTERP_PERSPECTIVE;<br>
+ perspect_load = TRUE;<br>
+ break;<br>
+ default:<br>
+ break;<br>
+ }<br>
+ }<br>
+<br>
+ if (d->Declaration.Centroid) {<br>
+ mode |= INTERP_CENTROID;<br>
+ centroid_load = TRUE;<br>
+ perspect_load = FALSE;<br>
+ }<br>
+<br>
+ assert(last < 32);<br>
+ for (i = first; i <= last; i++)<br>
+ interp_mode[i] = mode;<br>
+ }<br>
break;<br>
case TGSI_FILE_CONSTANT:<br>
if (pc->param_nr < (last + 1))<br>
@@ -1367,6 +1632,43 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
}<br>
break;<br>
case TGSI_TOKEN_TYPE_INSTRUCTION:<br>
+ {<br>
+ const struct tgsi_full_instruction *insn;<br>
+ const struct tgsi_full_src_register *src;<br>
+ const struct tgsi_dst_register *dst;<br>
+ unsigned mask;<br>
+<br>
+ pc->insn_nr++;<br>
+<br>
+ if (!last_t_use) {<br>
+ last_t_use = CALLOC(pc->temp_nr * 4, sizeof(unsigned));<br>
+ last_a_use = CALLOC(pc->attr_nr * 4, sizeof(unsigned));<br>
+ }<br>
+<br>
+ insn = &tok->FullInstruction;<br>
+ dst = &insn->FullDstRegisters[0].DstRegister;<br>
+ mask = dst->WriteMask;<br>
+<br>
+#ifdef NV50_PROGRAM_DUMP<br>
+ tgsi_dump_instruction(insn, 1);<br>
+#endif<br>
+ if (dst->File == TGSI_FILE_TEMPORARY) {<br>
+ for (c = 0; c < 4; c++)<br>
+ if (mask & (1 << c))<br>
+ last_t_use[dst->Index * 4 + c] = pc->insn_nr;<br>
+ }<br>
+<br>
+ for (i = 0; i < insn->Instruction.NumSrcRegs; ++i) {<br>
+ src = &insn->FullSrcRegisters[i];<br>
+ insn_adjust_mask(insn, &mask);<br>
+<br>
+ if (src->SrcRegister.File == TGSI_FILE_TEMPORARY)<br>
+ set_acc_array(last_t_use, src, mask, pc->insn_nr);<br>
+ else<br>
+ if (src->SrcRegister.File == TGSI_FILE_INPUT)<br>
+ set_acc_array(last_a_use, src, mask, pc->insn_nr);<br>
+ }<br>
+ }<br>
break;<br>
default:<br>
break;<br>
@@ -1487,6 +1789,11 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
}<br>
}<br>
<br>
+ if (last_t_use)<br>
+ FREE(last_t_use);<br>
+ if (last_a_use)<br>
+ FREE(last_a_use);<br>
+<br>
ret = TRUE;<br>
out_err:<br>
tgsi_parse_free(&p);<br>
@@ -1516,8 +1823,15 @@ nv50_program_tx(struct nv50_program *p)<br>
<br>
tgsi_parse_token(&parse);<br>
<br>
+ /* don't allow half insn on first and last (not END) instruction */<br>
+ if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr)<br>
+ pc->allow32 = FALSE;<br>
+ else<br>
+ pc->allow32 = TRUE;<br>
+<br>
switch (tok->Token.Type) {<br>
case TGSI_TOKEN_TYPE_INSTRUCTION:<br>
+ ++pc->insn_cur;<br>
ret = nv50_program_tx_insn(pc, tok);<br>
if (ret == FALSE)<br>
goto out_err;<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h<br>
index 78deed6..3b3b6bb 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.h<br>
+++ b/src/gallium/drivers/nv50/nv50_program.h<br>
@@ -39,6 +39,11 @@ struct nv50_program {<br>
struct {<br>
unsigned attr[2];<br>
} vp;<br>
+ struct {<br>
+ unsigned regs[4];<br>
+ unsigned map[4];<br>
+ unsigned high_map;<br>
+ } fp;<br>
} cfg;<br>
};<br>
<br>
<br>commit ebcc4b9cf61a25d8ef2fa87eecfb5e4e75b47bca<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Tue May 5 20:56:12 2009 +0200<br>
<br>
- more correct loading FP interpolants, also consider interpolation mode<br>
- use tgsi resource nv50_regs to store attributes<br>
- improve values of shader registers<br>
- make sure FP depth output goes where it's supposed to go<br>
- loop through all instructions and make sure there are no single half insns<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index cb92a31..9acf882 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -445,20 +445,29 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
#define INTERP_PERSPECTIVE 2<br>
#define INTERP_CENTROID 4<br>
<br>
+/* interpolant index has been stored in dst->rhw */<br>
static void<br>
-emit_interp(struct nv50_pc *pc, struct nv50_reg *dst,<br>
- struct nv50_reg *src, struct nv50_reg *iv)<br>
+emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv,<br>
+ unsigned mode)<br>
{<br>
+ assert(dst->rhw != -1);<br>
struct nv50_program_exec *e = exec(pc);<br>
<br>
e->inst[0] |= 0x80000000;<br>
set_dst(pc, dst, e);<br>
- alloc_reg(pc, src);<br>
- e->inst[0] |= (src->hw << 16);<br>
- if (iv) {<br>
- e->inst[0] |= (1 << 25);<br>
- alloc_reg(pc, iv);<br>
- e->inst[0] |= (iv->hw << 9);<br>
+ e->inst[0] |= (dst->rhw << 16);<br>
+<br>
+ if (mode & INTERP_FLAT) {<br>
+ e->inst[0] |= (1 << 8);<br>
+ } else {<br>
+ if (mode & INTERP_PERSPECTIVE) {<br>
+ e->inst[0] |= (1 << 25);<br>
+ alloc_reg(pc, iv);<br>
+ e->inst[0] |= (iv->hw << 9);<br>
+ }<br>
+<br>
+ if (mode & INTERP_CENTROID)<br>
+ e->inst[0] |= (1 << 24);<br>
}<br>
<br>
emit(pc, e);<br>
@@ -982,6 +991,43 @@ emit_nop(struct nv50_pc *pc, boolean l)<br>
emit(pc, e);<br>
}<br>
<br>
+static void<br>
+convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)<br>
+{<br>
+ unsigned q = 0, m = ~0;<br>
+<br>
+ assert(!is_long(e));<br>
+<br>
+ switch (e->inst[0] >> 28) {<br>
+ case 0x1:<br>
+ /* MOV */<br>
+ q = 0x0403c000;<br>
+ m = 0xFFFF7FFF;<br>
+ break;<br>
+ case 0x8:<br>
+ /* INTERP */<br>
+ m = ~0x02000000;<br>
+ if (e->inst[0] & 0x02000000)<br>
+ q = 0x00020000;<br>
+ break;<br>
+ case 0xC:<br>
+ /* MUL */<br>
+ break;<br>
+ case 0x9:<br>
+ /* RCP */<br>
+ break;<br>
+ default:<br>
+ assert(0);<br>
+ break;<br>
+ }<br>
+<br>
+ set_long(pc, e);<br>
+ pc->p->exec_size++;<br>
+<br>
+ e->inst[0] &= m;<br>
+ e->inst[1] |= q;<br>
+}<br>
+<br>
/* Adjust a bitmask that indicates what components of a source are used,<br>
* we use this in tx_prep so we only load interpolants that are needed.<br>
*/<br>
@@ -1005,20 +1051,21 @@ insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask)<br>
case TGSI_OPCODE_RSQ:<br>
*mask = 0x1;<br>
break;<br>
- case TGSI_OPCODE_TXP:<br>
- *mask = 0x8;<br>
- /* fall through to TEX */<br>
case TGSI_OPCODE_TEX:<br>
+ case TGSI_OPCODE_TXP:<br>
assert(insn->Instruction.Extended);<br>
tex = &insn->InstructionExtTexture;<br>
<br>
+ *mask = 0x7;<br>
if (tex->Texture == TGSI_TEXTURE_1D)<br>
- *mask |= 0x1;<br>
+ *mask = 0x1;<br>
else<br>
if (tex->Texture == TGSI_TEXTURE_2D)<br>
- *mask |= 0x3;<br>
- else<br>
- *mask |= 0x7;<br>
+ *mask = 0x3;<br>
+<br>
+ if (insn->Instruction.Opcode == TGSI_OPCODE_TXP)<br>
+ *mask |= 0x8;<br>
+ break;<br>
default:<br>
break;<br>
}<br>
@@ -1255,6 +1302,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
emit_kil(pc, src[0][1]);<br>
emit_kil(pc, src[0][2]);<br>
emit_kil(pc, src[0][3]);<br>
+ pc->p->cfg.fp.regs[2] |= 0x00100000;<br>
break;<br>
case TGSI_OPCODE_LIT:<br>
emit_lit(pc, &dst[0], mask, &src[0][0]);<br>
@@ -1503,7 +1551,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
unsigned *last_t_use = NULL;<br>
unsigned *last_a_use = NULL;<br>
<br>
- depr = fcol = bcol = fcrd = 0xFFFFFFFF;<br>
+ depr = fcol = bcol = fcrd = 0xFFFF;<br>
<br>
if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
pc->p->cfg.fp.regs[0] = 0x01000404;<br>
@@ -1683,37 +1731,106 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
for (i = 0; i < pc->temp_nr; i++) {<br>
for (c = 0; c < 4; c++) {<br>
pc->temp[i*4+c].type = P_TEMP;<br>
- pc->temp[i*4+c].hw = -1;<br>
+ pc->temp[i*4+c].hw = pc->temp[i*4+c].rhw = -1;<br>
pc->temp[i*4+c].index = i;<br>
+ pc->temp[i*4+c].acc = last_t_use[i*4+c];<br>
}<br>
}<br>
}<br>
<br>
if (pc->attr_nr) {<br>
- struct nv50_reg *iv = NULL;<br>
- int aid = 0;<br>
+ struct nv50_reg *iv, *iv_c = NULL, *iv_p = NULL;<br>
+ int oid, off = 4, mid = 0, aid = 0;<br>
+<br>
+ /* off = VP output id offset to i*4 (oid = i*4 + off + c)<br>
+ * aid = FP attribute/interpolant id (incremented only for used attrs)<br>
+ * mid = VP output mapping field ID (HPOS not counted)<br>
+ */<br>
<br>
pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg));<br>
if (!pc->attr)<br>
goto out_err;<br>
<br>
+ i = 0;<br>
if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
- iv = alloc_temp(pc, NULL);<br>
- emit_interp(pc, iv, iv, NULL);<br>
- emit_flop(pc, 0, iv, iv);<br>
- aid++;<br>
+<br>
+ if (fcrd != 0xFFFF) {<br>
+ assert(fcrd == 0); /* position input should always be 0 */<br>
+ i = 1;<br>
+ off = 0;<br>
+ for (c = 0; c < 4; ++c) {<br>
+ if (last_a_use[c] == 0)<br>
+ continue;<br>
+<br>
+ pc->attr[c].index = fcrd;<br>
+ pc->attr[c].type = P_TEMP;<br>
+ pc->attr[c].acc = last_a_use[c];<br>
+ pc->attr[c].hw = pc->attr[c].rhw = -1;<br>
+<br>
+ alloc_reg(pc, &pc->attr[c]);<br>
+ pc->attr[c].rhw = aid++;<br>
+<br>
+ emit_interp(pc, &pc->attr[c], NULL, INTERP_LINEAR);<br>
+ pc->p->cfg.fp.regs[1] |= (1 << (24 + c));<br>
+<br>
+ switch (c) {<br>
+ case 0:<br>
+ case 1:<br>
+ /* should probably do viewport stuff here */<br>
+ break;<br>
+ case 3:<br>
+ iv_p = &pc->attr[c];<br>
+ emit_flop(pc, 0, iv_p, iv_p);<br>
+ break;<br>
+ default:<br>
+ break;<br>
+ }<br>
+ }<br>
+ }<br>
+<br>
+ if (perspect_load && !iv_p) {<br>
+ iv_p = alloc_temp(pc, NULL);<br>
+ iv_p->rhw = aid++;<br>
+ emit_interp(pc, iv_p, NULL, INTERP_LINEAR);<br>
+ emit_flop(pc, 0, iv_p, iv_p);<br>
+ pc->p->cfg.fp.regs[1] |= 0x08000000;<br>
+ }<br>
+<br>
+ if (centroid_load) {<br>
+ iv_c = alloc_temp(pc, NULL);<br>
+ iv_c->rhw = iv_p ? aid - 1 : aid++;<br>
+ emit_interp(pc, iv_c, NULL, INTERP_CENTROID);<br>
+ emit_flop(pc, 0, iv_c, iv_c);<br>
+ pc->p->cfg.fp.regs[1] |= 0x08000000;<br>
+ }<br>
}<br>
<br>
- for (i = 0; i < pc->attr_nr; i++) {<br>
+ for (; i < pc->attr_nr; i++) {<br>
struct nv50_reg *a = &pc->attr[i*4];<br>
+ iv = (interp_mode[i] & INTERP_CENTROID) ? iv_c : iv_p;<br>
<br>
for (c = 0; c < 4; c++) {<br>
if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
- struct nv50_reg *at =<br>
- alloc_temp(pc, NULL);<br>
- pc->attr[i*4+c].type = at->type;<br>
- pc->attr[i*4+c].hw = at->hw;<br>
- pc->attr[i*4+c].index = at->index;<br>
+ a[c].hw = a[c].rhw = -1;<br>
+ a[c].index = -1;<br>
+ if (last_a_use[i*4+c] == 0)<br>
+ continue;<br>
+<br>
+ if (i == fcol || i == bcol)<br>
+ pc->p->cfg.fp.regs[0] += 0x00010000;<br>
+ pc->p->cfg.fp.regs[1] += 0x00010000;<br>
+<br>
+ a[c].index = i;<br>
+ a[c].type = P_TEMP;<br>
+ a[c].acc = last_a_use[i*4+c];<br>
+<br>
+ alloc_reg(pc, &a[c]);<br>
+ a[c].rhw = aid++;<br>
+ emit_interp(pc, &a[c], iv, interp_mode[i]);<br>
+<br>
+ oid = off + i * 4 + c;<br>
+ pc->p->cfg.fp.map[mid / 4] |= oid << (8 * (mid % 4));<br>
+ mid++;<br>
} else {<br>
pc->p->cfg.vp.attr[aid/32] |=<br>
(1 << (aid % 32));<br>
@@ -1722,18 +1839,16 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
pc->attr[i*4+c].index = i;<br>
}<br>
}<br>
+ }<br>
<br>
- if (pc->p->type != PIPE_SHADER_FRAGMENT)<br>
- continue;<br>
+ if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
+ pc->p->cfg.fp.high_map = (mid / 4) + ((mid % 4) ? 1 : 0);<br>
<br>
- emit_interp(pc, &a[0], &a[0], iv);<br>
- emit_interp(pc, &a[1], &a[1], iv);<br>
- emit_interp(pc, &a[2], &a[2], iv);<br>
- emit_interp(pc, &a[3], &a[3], iv);<br>
+ if (iv_p && iv_p->index == -1)<br>
+ free_temp(pc, iv_p);<br>
+ if (iv_c)<br>
+ free_temp(pc, iv_c);<br>
}<br>
-<br>
- if (iv)<br>
- free_temp(pc, iv);<br>
}<br>
<br>
if (pc->result_nr) {<br>
@@ -1748,9 +1863,15 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
if (pc->p->type == PIPE_SHADER_FRAGMENT) {<br>
pc->result[i*4+c].type = P_TEMP;<br>
pc->result[i*4+c].hw = -1;<br>
+ if (i == depr) {<br>
+ pc->result[i*4+c].rhw = ((c == 2) ?<br>
+ (pc->result_nr - 1) * 4 : -1);<br>
+ } else<br>
+ pc->result[i*4+c].rhw = rid++;<br>
} else {<br>
pc->result[i*4+c].type = P_RESULT;<br>
pc->result[i*4+c].hw = rid++;<br>
+ pc->result[i*4+c].rhw = -1;<br>
}<br>
pc->result[i*4+c].index = i;<br>
}<br>
@@ -1805,6 +1926,7 @@ nv50_program_tx(struct nv50_program *p)<br>
{<br>
struct tgsi_parse_context parse;<br>
struct nv50_pc *pc;<br>
+ unsigned i, k;<br>
boolean ret;<br>
<br>
pc = CALLOC_STRUCT(nv50_pc);<br>
@@ -1843,10 +1965,42 @@ nv50_program_tx(struct nv50_program *p)<br>
<br>
if (p->type == PIPE_SHADER_FRAGMENT) {<br>
struct nv50_reg out;<br>
-<br>
out.type = P_TEMP;<br>
- for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++)<br>
- emit_mov(pc, &out, &pc->result[out.hw]);<br>
+<br>
+ for (i = 0; i < pc->result_nr * 4; i++) {<br>
+ if (pc->result[i].rhw == -1)<br>
+ continue;<br>
+ if (pc->result[i].hw != pc->result[i].rhw) {<br>
+ out.hw = pc->result[i].rhw;<br>
+ emit_mov(pc, &out, &pc->result[i]);<br>
+ }<br>
+ if (pc->p->cfg.high_result < pc->result[i].rhw + 1)<br>
+ pc->p->cfg.high_result = pc->result[i].rhw + 1;<br>
+ }<br>
+ }<br>
+<br>
+ /* look for single half instructions and make them long */<br>
+ struct nv50_program_exec *e, *e_prev;<br>
+<br>
+ for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {<br>
+ if (!is_long(e))<br>
+ k++;<br>
+<br>
+ if (!e->next || is_long(e->next)) {<br>
+ if (k & 1)<br>
+ convert_to_long(pc, e);<br>
+ k = 0;<br>
+ }<br>
+<br>
+ if (e->next)<br>
+ e_prev = e;<br>
+ }<br>
+<br>
+ if (!is_long(pc->p->exec_tail)) {<br>
+ /* this may occur if moving FP results */<br>
+ assert(e_prev && !is_long(e_prev));<br>
+ convert_to_long(pc, e_prev);<br>
+ convert_to_long(pc, pc->p->exec_tail);<br>
}<br>
<br>
assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));<br>
@@ -1973,7 +2127,7 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)<br>
if (is_long(e))<br>
NOUVEAU_ERR("0x%08x\n", e->inst[1]);<br>
}<br>
-<br>
+ FREE(up);<br>
#endif<br>
<br>
up = ptr = MALLOC(p->exec_size * 4);<br>
@@ -2058,6 +2212,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)<br>
struct nouveau_grobj *tesla = nv50->screen->tesla;<br>
struct nv50_program *p = nv50->fragprog;<br>
struct nouveau_stateobj *so;<br>
+ unsigned i;<br>
<br>
if (!p->translated) {<br>
nv50_program_validate(nv50, p);<br>
@@ -2068,24 +2223,30 @@ nv50_fragprog_validate(struct nv50_context *nv50)<br>
nv50_program_validate_data(nv50, p);<br>
nv50_program_validate_code(nv50, p);<br>
<br>
- so = so_new(64, 2);<br>
+ so = so_new(32, 2);<br>
so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2);<br>
so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |<br>
NOUVEAU_BO_HIGH, 0, 0);<br>
so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |<br>
NOUVEAU_BO_LOW, 0, 0);<br>
so_method(so, tesla, 0x1904, 4);<br>
- so_data (so, 0x00040404); /* p: 0x01000404 */<br>
+ so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 etc. */<br>
so_data (so, 0x00000004);<br>
so_data (so, 0x00000000);<br>
so_data (so, 0x00000000);<br>
- so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */<br>
+ so_method(so, tesla, 0x16bc, 1 + p->cfg.fp.high_map);<br>
so_data (so, 0x03020100);<br>
- so_data (so, 0x07060504);<br>
- so_data (so, 0x0b0a0908);<br>
+ for (i = 0; i < p->cfg.fp.high_map; i++)<br>
+ so_data(so, p->cfg.fp.map[i]);<br>
so_method(so, tesla, 0x1988, 2);<br>
- so_data (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */<br>
+ so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 etc. */<br>
so_data (so, p->cfg.high_temp);<br>
+ so_method(so, tesla, 0x1298, 1);<br>
+ so_data (so, p->cfg.high_result);<br>
+ so_method(so, tesla, 0x19a8, 1);<br>
+ so_data (so, p->cfg.fp.regs[2]);<br>
+ so_method(so, tesla, 0x196c, 1);<br>
+ so_data (so, p->cfg.fp.regs[3]);<br>
so_method(so, tesla, 0x1414, 1);<br>
so_data (so, 0); /* program start offset */<br>
so_ref(so, &nv50->state.fragprog);<br>
<br>commit dacf2f879d63b5bf756da62eee901379336e7335<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Tue May 5 20:57:15 2009 +0200<br>
<br>
- avoid overwriting sources before they're used in cases where dst == src<br>
- add magical adjustment for register 1988 (I should find out how that<br>
really works)<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 9acf882..e4fc261 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -1162,12 +1162,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src)<br>
return r;<br>
}<br>
<br>
+/* returns TRUE if instruction can overwrite sources before they're read */<br>
+static boolean<br>
+direct2dest_op(const struct tgsi_full_instruction *insn)<br>
+{<br>
+ if (insn->Instruction.Saturate)<br>
+ return FALSE;<br>
+<br>
+ switch (insn->Instruction.Opcode) {<br>
+ case TGSI_OPCODE_COS:<br>
+ case TGSI_OPCODE_DP3:<br>
+ case TGSI_OPCODE_DP4:<br>
+ case TGSI_OPCODE_DPH:<br>
+ case TGSI_OPCODE_KIL:<br>
+ case TGSI_OPCODE_LIT:<br>
+ case TGSI_OPCODE_POW:<br>
+ case TGSI_OPCODE_RCP:<br>
+ case TGSI_OPCODE_RSQ:<br>
+ case TGSI_OPCODE_SCS:<br>
+ case TGSI_OPCODE_SIN:<br>
+ case TGSI_OPCODE_TEX:<br>
+ case TGSI_OPCODE_TXP:<br>
+ return FALSE;<br>
+ default:<br>
+ return TRUE;<br>
+ }<br>
+}<br>
+<br>
static boolean<br>
nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
{<br>
const struct tgsi_full_instruction *inst = &tok->FullInstruction;<br>
struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp = NULL;<br>
unsigned mask, sat, unit;<br>
+ boolean assimilate = FALSE;<br>
int i, c;<br>
<br>
mask = inst->FullDstRegisters[0].DstRegister.WriteMask;<br>
@@ -1178,6 +1206,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]);<br>
else<br>
dst[c] = NULL;<br>
+<br>
+ rdst[c] = NULL;<br>
+<br>
+ src[0][c] = NULL;<br>
+ src[1][c] = NULL;<br>
+ src[2][c] = NULL;<br>
}<br>
<br>
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {<br>
@@ -1195,8 +1229,35 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
rdst[c] = dst[c];<br>
dst[c] = temp_temp(pc);<br>
}<br>
+ } else if (direct2dest_op(inst)) {<br>
+ for (c = 0; c < 4; c++) {<br>
+ if (!dst[c] || dst[c]->type != P_TEMP)<br>
+ continue;<br>
+<br>
+ for (i = c + 1; i < 4; i++) {<br>
+ if (dst[c] == src[0][i] ||<br>
+ dst[c] == src[1][i] ||<br>
+ dst[c] == src[2][i])<br>
+ break;<br>
+ }<br>
+ if (i == 4)<br>
+ continue;<br>
+<br>
+ assimilate = TRUE;<br>
+ rdst[c] = dst[c];<br>
+ dst[c] = alloc_preferred_temp(pc, rdst[c]->rhw);<br>
+ }<br>
+ } else if (inst->Instruction.Opcode == TGSI_OPCODE_LIT) {<br>
+ /* XXX: shouldn't give LIT an extra case here */<br>
+ if (src[0][1] == dst[1] ||<br>
+ src[0][3] == dst[1]) {<br>
+ assimilate = TRUE;<br>
+ rdst[1] = dst[1];<br>
+ dst[1] = alloc_temp(pc, NULL);<br>
+ }<br>
}<br>
<br>
+ i = -1;<br>
switch (inst->Instruction.Opcode) {<br>
case TGSI_OPCODE_ABS:<br>
for (c = 0; c < 4; c++) {<br>
@@ -1373,14 +1434,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
for (c = 0; c < 4; c++) {<br>
if (!(mask & (1 << c)))<br>
continue;<br>
- emit_flop(pc, 0, dst[c], src[0][0]);<br>
+ if (i == -1) {<br>
+ emit_flop(pc, 0, dst[c], src[0][0]);<br>
+ i = c;<br>
+ } else<br>
+ emit_mov(pc, dst[c], dst[i]);<br>
}<br>
break;<br>
case TGSI_OPCODE_RSQ:<br>
for (c = 0; c < 4; c++) {<br>
if (!(mask & (1 << c)))<br>
continue;<br>
- emit_flop(pc, 2, dst[c], src[0][0]);<br>
+ if (i == -1) {<br>
+ emit_flop(pc, 2, dst[c], src[0][0]);<br>
+ i = c;<br>
+ } else<br>
+ emit_mov(pc, dst[c], dst[i]);<br>
}<br>
break;<br>
case TGSI_OPCODE_SCS:<br>
@@ -1491,6 +1560,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
set_src_0(pc, dst[c], e);<br>
emit(pc, e);<br>
}<br>
+ } else if (assimilate) {<br>
+ for (c = 0; c < 4; c++)<br>
+ if (rdst[c])<br>
+ assimilate_temp(pc, rdst[c], dst[c]);<br>
}<br>
<br>
for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {<br>
@@ -1499,10 +1572,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
continue;<br>
if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD)<br>
FREE(src[i][c]);<br>
-<br>
- /* Might also release temporaries not used anymore in this loop,<br>
- * therefore no temp_immd and temp_immd_nr like for temp_temp.<br>
- */<br>
+ else<br>
+ if (src[i][c]->acc == pc->insn_cur)<br>
+ release_hw(pc, src[i][c]);<br>
}<br>
}<br>
<br>
@@ -1996,6 +2068,18 @@ nv50_program_tx(struct nv50_program *p)<br>
e_prev = e;<br>
}<br>
<br>
+ /* adjust register 1988 'heuristically' */<br>
+ /* XXX: make this go away */<br>
+ for (i = 0, k = 0; k < 4; ++k)<br>
+ if (pc->p->cfg.fp.regs[1] & (1 << (24 + k)))<br>
+ i++;<br>
+ if (i > 3 || i < ((pc->p->cfg.fp.regs[1] >> 16) & 0xFF) + 3) {<br>
+ pc->p->cfg.fp.regs[1] &= 0xFFFFFF00;<br>
+ pc->p->cfg.fp.regs[1] |= ((pc->p->cfg.fp.regs[1] >> 16) & 0xFF);<br>
+ } else {<br>
+ pc->p->cfg.fp.regs[1] |= (3 - i);<br>
+ }<br>
+<br>
if (!is_long(pc->p->exec_tail)) {<br>
/* this may occur if moving FP results */<br>
assert(e_prev && !is_long(e_prev));<br>
<br>commit 4411b1e3b3c11c69ec11148783327759a94165e2<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Wed May 6 11:46:17 2009 +0200<br>
<br>
Enable half insns and immediates for MOV and ADD.<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index e4fc261..2ab7b57 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -426,7 +426,7 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)<br>
static INLINE void<br>
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
{<br>
- unsigned val = fui(pc->immd_buf[imm->hw]); /* XXX */<br>
+ unsigned val = fui(pc->immd_buf[imm->hw - pc->param_nr * 4]);<br>
<br>
set_long(pc, e);<br>
/*XXX: can't be predicated - bits overlap.. catch cases where both<br>
@@ -505,12 +505,11 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
<br>
set_dst(pc, dst, e);<br>
<br>
- if (0 && dst->type != P_RESULT && src->type == P_IMMD) {<br>
+ if (pc->allow32 && dst->type != P_RESULT && src->type == P_IMMD) {<br>
set_immd(pc, src, e);<br>
/*XXX: 32-bit, but steals part of "half" reg space - need to<br>
* catch and handle this case if/when we do half-regs<br>
*/<br>
- e->inst[0] |= 0x00008000;<br>
} else<br>
if (src->type == P_IMMD || src->type == P_CONST) {<br>
set_long(pc, e);<br>
@@ -526,13 +525,15 @@ emit_mov(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)<br>
e->inst[0] |= (src->hw << 9);<br>
}<br>
<br>
- /* We really should support "half" instructions here at some point,<br>
- * but I don't feel confident enough about them yet.<br>
- */<br>
- set_long(pc, e);<br>
- if (is_long(e) && !is_immd(e)) {<br>
+ if (!is_long(e) || is_immd(e))<br>
+ e->inst[0] |= 0x00008000;<br>
+ else {<br>
e->inst[1] |= 0x04000000; /* 32-bit */<br>
- e->inst[1] |= 0x0003c000; /* "subsubop" 0xf == mov */<br>
+<br>
+ /* XXX: look into this 0x3 or 0xf again */<br>
+ e->inst[1] |= 0x0000c000; /* "subsubop" 0x3 */<br>
+ if (!(e->inst[1] & 0x20000000))<br>
+ e->inst[1] |= 0x00030000; /* "subsubop" 0xf */<br>
}<br>
<br>
emit(pc, e);<br>
@@ -606,6 +607,7 @@ set_src_1(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
e->inst[0] |= (src->hw << 16);<br>
}<br>
<br>
+/* XXX: can source 2 really be a constant ? */<br>
static void<br>
set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)<br>
{<br>
@@ -670,7 +672,10 @@ emit_add(struct nv50_pc *pc, struct nv50_reg *dst,<br>
check_swap_src_0_1(pc, &src0, &src1);<br>
set_dst(pc, dst, e);<br>
set_src_0(pc, src0, e);<br>
- if (is_long(e))<br>
+ if (!is_long(e) && src1->type == P_IMMD && pc->allow32)<br>
+ set_immd(pc, src1, e);<br>
+ else<br>
+ if (requires_long(e, src1))<br>
set_src_2(pc, src1, e);<br>
else<br>
set_src_1(pc, src1, e);<br>
@@ -902,6 +907,7 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
struct nv50_reg *neg128 = alloc_immd(pc, -127.999999);<br>
struct nv50_reg *pos128 = alloc_immd(pc, 127.999999);<br>
struct nv50_reg *tmp[4];<br>
+ boolean allow32 = pc->allow32;<br>
<br>
if (mask & (3 << 1)) {<br>
if (mask & (1 << 1))<br>
@@ -911,6 +917,8 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
emit_minmax(pc, 4, tmp[0], src[0], zero);<br>
}<br>
<br>
+ pc->allow32 = FALSE;<br>
+<br>
if (mask & (1 << 2)) {<br>
set_pred_wr(pc, 1, 0, pc->p->exec_tail);<br>
<br>
@@ -926,6 +934,8 @@ emit_lit(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,<br>
set_pred(pc, 3, 0, pc->p->exec_tail);<br>
}<br>
<br>
+ pc->allow32 = allow32;<br>
+<br>
/* do this last, in case src[i,j] == dst[0,3] */<br>
if (mask & (1 << 0))<br>
emit_mov(pc, dst[0], one);<br>
<br>commit 88dbc993e651da91d66c4ca471d11ee5aa2b5085<br>
Author: chr <chr@LAPTOP.(none)><br>
Date: Wed May 6 11:50:17 2009 +0200<br>
<br>
Use multiple (3 for now: PVP, PFP, PMISC) constant buffers.<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c<br>
index 2ab7b57..6e279bd 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.c<br>
+++ b/src/gallium/drivers/nv50/nv50_program.c<br>
@@ -426,7 +426,7 @@ set_dst(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_program_exec *e)<br>
static INLINE void<br>
set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e)<br>
{<br>
- unsigned val = fui(pc->immd_buf[imm->hw - pc->param_nr * 4]);<br>
+ unsigned val = fui(pc->immd_buf[imm->hw]);<br>
<br>
set_long(pc, e);<br>
/*XXX: can't be predicated - bits overlap.. catch cases where both<br>
@@ -478,22 +478,14 @@ set_data(struct nv50_pc *pc, struct nv50_reg *src, unsigned m, unsigned s,<br>
struct nv50_program_exec *e)<br>
{<br>
set_long(pc, e);<br>
-#if 1<br>
- e->inst[1] |= (1 << 22);<br>
-#else<br>
- if (src->type == P_IMMD) {<br>
- e->inst[1] |= (NV50_CB_PMISC << 22);<br>
- } else {<br>
- if (pc->p->type == PIPE_SHADER_VERTEX)<br>
- e->inst[1] |= (NV50_CB_PVP << 22);<br>
- else<br>
- e->inst[1] |= (NV50_CB_PFP << 22);<br>
- }<br>
-#endif<br>
<br>
+ /* XXX: param.bs can be extracted from inst[1] */<br>
+ e->param.bs = (src->type == P_IMMD) ? 0 : 1;<br>
e->param.index = src->hw;<br>
e->param.shift = s;<br>
e->param.mask = m << (s % 32);<br>
+<br>
+ e->inst[1] |= (e->param.bs << 22);<br>
}<br>
<br>
static void<br>
@@ -1502,7 +1494,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)<br>
}<br>
break;<br>
case TGSI_OPCODE_TEX:<br>
- case TGSI_OPCODE_TXP:<br>
+ case TGSI_OPCODE_TXP: /* XXX: TXP should use w-component as iv on interp */<br>
{<br>
struct nv50_reg *t[4];<br>
struct nv50_program_exec *e;<br>
@@ -1977,7 +1969,7 @@ nv50_program_tx_prep(struct nv50_pc *pc)<br>
}<br>
<br>
if (pc->immd_nr) {<br>
- int rid = pc->param_nr * 4;<br>
+ int rid = 0;<br>
<br>
pc->immd = CALLOC(pc->immd_nr * 4, sizeof(struct nv50_reg));<br>
if (!pc->immd)<br>
@@ -2121,7 +2113,7 @@ nv50_program_validate(struct nv50_context *nv50, struct nv50_program *p)<br>
<br>
static void<br>
nv50_program_upload_data(struct nv50_context *nv50, float *map,<br>
- unsigned start, unsigned count)<br>
+ unsigned start, unsigned count, unsigned cbuf)<br>
{<br>
struct nouveau_channel *chan = nv50->screen->nvws->channel;<br>
struct nouveau_grobj *tesla = nv50->screen->tesla;<br>
@@ -2130,7 +2122,7 @@ nv50_program_upload_data(struct nv50_context *nv50, float *map,<br>
unsigned nr = count > 2047 ? 2047 : count;<br>
<br>
BEGIN_RING(chan, tesla, 0x00000f00, 1);<br>
- OUT_RING (chan, (NV50_CB_PMISC << 0) | (start << 8));<br>
+ OUT_RING (chan, (cbuf << 0) | (start << 8));<br>
BEGIN_RING(chan, tesla, 0x40000f04, nr);<br>
OUT_RINGp (chan, map, nr);<br>
<br>
@@ -2145,35 +2137,48 @@ nv50_program_validate_data(struct nv50_context *nv50, struct nv50_program *p)<br>
{<br>
struct nouveau_winsys *nvws = nv50->screen->nvws;<br>
struct pipe_winsys *ws = nv50->pipe.winsys;<br>
- unsigned nr = p->param_nr + p->immd_nr;<br>
<br>
- if (!p->data && nr) {<br>
- struct nouveau_resource *heap = nv50->screen->vp_data_heap;<br>
+ if (!p->data[0] && p->immd_nr) {<br>
+ struct nouveau_resource *heap = nv50->screen->immd_heap[0];<br>
+<br>
+ if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0])) {<br>
+ while (heap->next && heap->size < p->immd_nr) {<br>
+ struct nv50_program *evict = heap->next->priv;<br>
+ nvws->res_free(&evict->data[0]);<br>
+ }<br>
+<br>
+ if (nvws->res_alloc(heap, p->immd_nr, p, &p->data[0]))<br>
+ assert(0);<br>
+ }<br>
+<br>
+ /* immediates only need to be uploaded again when freed */<br>
+ nv50_program_upload_data(nv50, p->immd, p->data[0]->start,<br>
+ p->immd_nr, NV50_CB_PMISC);<br>
+ }<br>
+<br>
+ if (!p->data[1] && p->param_nr) {<br>
+ struct nouveau_resource *heap = nv50->screen->parm_heap[p->type];<br>
<br>
- if (nvws->res_alloc(heap, nr, p, &p->data)) {<br>
- while (heap->next && heap->size < nr) {<br>
+ if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1])) {<br>
+ while (heap->next && heap->size < p->param_nr) {<br>
struct nv50_program *evict = heap->next->priv;<br>
- nvws->res_free(&evict->data);<br>
+ nvws->res_free(&evict->data[1]);<br>
}<br>
<br>
- if (nvws->res_alloc(heap, nr, p, &p->data))<br>
+ if (nvws->res_alloc(heap, p->param_nr, p, &p->data[1]))<br>
assert(0);<br>
}<br>
}<br>
<br>
if (p->param_nr) {<br>
+ unsigned cbuf;<br>
float *map = ws->buffer_map(ws, nv50->constbuf[p->type],<br>
PIPE_BUFFER_USAGE_CPU_READ);<br>
- nv50_program_upload_data(nv50, map, p->data->start,<br>
- p->param_nr);<br>
+ cbuf = (p->type == PIPE_SHADER_VERTEX) ? NV50_CB_PVP : NV50_CB_PFP;<br>
+ nv50_program_upload_data(nv50, map, p->data[1]->start,<br>
+ p->param_nr, cbuf);<br>
ws->buffer_unmap(ws, nv50->constbuf[p->type]);<br>
}<br>
-<br>
- if (p->immd_nr) {<br>
- nv50_program_upload_data(nv50, p->immd,<br>
- p->data->start + p->param_nr,<br>
- p->immd_nr);<br>
- }<br>
}<br>
<br>
static void<br>
@@ -2193,20 +2198,26 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)<br>
upload = TRUE;<br>
}<br>
<br>
- if (p->data && p->data->start != p->data_start) {<br>
+ if ((p->data[0] && p->data[0]->start != p->data_start[0]) ||<br>
+ (p->data[1] && p->data[1]->start != p->data_start[1]))<br>
+ {<br>
for (e = p->exec_head; e; e = e->next) {<br>
unsigned ei, ci;<br>
<br>
if (e->param.index < 0)<br>
continue;<br>
ei = e->param.shift >> 5;<br>
- ci = e->param.index + p->data->start;<br>
+ ci = e->param.index + p->data[e->param.bs]->start;<br>
<br>
e->inst[ei] &= ~e->param.mask;<br>
e->inst[ei] |= (ci << e->param.shift);<br>
}<br>
<br>
- p->data_start = p->data->start;<br>
+ if (p->data[0])<br>
+ p->data_start[0] = p->data[0]->start;<br>
+ if (p->data[1])<br>
+ p->data_start[1] = p->data[1]->start;<br>
+<br>
upload = TRUE;<br>
}<br>
<br>
@@ -2364,7 +2375,8 @@ nv50_program_destroy(struct nv50_context *nv50, struct nv50_program *p)<br>
if (p->buffer)<br>
pipe_buffer_reference(&p->buffer, NULL);<br>
<br>
- nv50->screen->nvws->res_free(&p->data);<br>
+ nv50->screen->nvws->res_free(&p->data[0]);<br>
+ nv50->screen->nvws->res_free(&p->data[1]);<br>
<br>
p->translated = 0;<br>
}<br>
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h<br>
index 3b3b6bb..9dd0f37 100644<br>
--- a/src/gallium/drivers/nv50/nv50_program.h<br>
+++ b/src/gallium/drivers/nv50/nv50_program.h<br>
@@ -10,6 +10,7 @@ struct nv50_program_exec {<br>
unsigned inst[2];<br>
struct {<br>
int index;<br>
+ int bs; /* buffer selector */<br>
unsigned mask;<br>
unsigned shift;<br>
} param;<br>
@@ -24,8 +25,8 @@ struct nv50_program {<br>
struct nv50_program_exec *exec_head;<br>
struct nv50_program_exec *exec_tail;<br>
unsigned exec_size;<br>
- struct nouveau_resource *data;<br>
- unsigned data_start;<br>
+ struct nouveau_resource *data[2];<br>
+ unsigned data_start[2];<br>
<br>
struct pipe_buffer *buffer;<br>
<br>
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c<br>
index 2980564..268eeeb 100644<br>
--- a/src/gallium/drivers/nv50/nv50_screen.c<br>
+++ b/src/gallium/drivers/nv50/nv50_screen.c<br>
@@ -290,20 +290,61 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_winsys *nvws)<br>
so_method(so, screen->tesla, 0x16b8, 1);<br>
so_data (so, 8);<br>
<br>
- /* Shared constant buffer */<br>
- screen->constbuf = screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);<br>
- if (nvws->res_init(&screen->vp_data_heap, 0, 128)) {<br>
- NOUVEAU_ERR("Error initialising constant buffer\n");<br>
+ /* constant buffers for immediates and VP/FP parameters */<br>
+ screen->constbuf_misc[0] =<br>
+ screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+ screen->constbuf_parm[0] =<br>
+ screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+ screen->constbuf_parm[1] =<br>
+ screen->pipe.buffer_create(&screen->pipe, 0, 0, 128 * 4 * 4);<br>
+<br>
+ if (nvws->res_init(&screen->immd_heap[0], 0, 128) ||<br>
+ nvws->res_init(&screen->parm_heap[0], 0, 128) ||<br>
+ nvws->res_init(&screen->parm_heap[1], 0, 128))<br>
+ {<br>
+ NOUVEAU_ERR("Error initialising constant buffers.\n");<br>
nv50_screen_destroy(&screen->pipe);<br>
return NULL;<br>
}<br>
<br>
so_method(so, screen->tesla, 0x1280, 3);<br>
- so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |<br>
+ so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |<br>
NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
- so_reloc (so, screen->constbuf, 0, NOUVEAU_BO_VRAM |<br>
+ so_reloc (so, screen->constbuf_misc[0], 0, NOUVEAU_BO_VRAM |<br>
NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
- so_data (so, (NV50_CB_PMISC << 16) | 0x00001000);<br>
+ so_data (so, (NV50_CB_PMISC << 16) | 0x00000800);<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x00000001 | (NV50_CB_PMISC << 12));<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x00000031 | (NV50_CB_PMISC << 12));<br>
+<br>
+ so_method(so, screen->tesla, 0x1280, 3);<br>
+ so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |<br>
+ NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
+ so_reloc (so, screen->constbuf_parm[0], 0, NOUVEAU_BO_VRAM |<br>
+ NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
+ so_data (so, (NV50_CB_PVP << 16) | 0x00000800);<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x00000101 | (NV50_CB_PVP << 12));<br>
+<br>
+ so_method(so, screen->tesla, 0x1280, 3);<br>
+ so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |<br>
+ NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0);<br>
+ so_reloc (so, screen->constbuf_parm[1], 0, NOUVEAU_BO_VRAM |<br>
+ NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0);<br>
+ so_data (so, (NV50_CB_PMISC << 16) | 0x00000800);<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x00000131 | (NV50_CB_PFP << 12));<br>
+<br>
+ /*<br>
+ // map VP and FP CB index 0 to CB_PMISC<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x000BBNP1 = 0x00000001);<br>
+ so_method(so, screen->tesla, 0x1694, 1);<br>
+ so_data (so, 0x000BBNP1 = 0x00000031);<br>
+ */<br>
<br>
/* Texture sampler/image unit setup - we abuse the constant buffer<br>
* upload mechanism for the moment to upload data to the tex config<br>
diff --git a/src/gallium/drivers/nv50/nv50_screen.h b/src/gallium/drivers/nv50/nv50_screen.h<br>
index db567aa..31b8ef2 100644<br>
--- a/src/gallium/drivers/nv50/nv50_screen.h<br>
+++ b/src/gallium/drivers/nv50/nv50_screen.h<br>
@@ -15,8 +15,11 @@ struct nv50_screen {<br>
struct nouveau_grobj *m2mf;<br>
struct nouveau_notifier *sync;<br>
<br>
- struct pipe_buffer *constbuf;<br>
- struct nouveau_resource *vp_data_heap;<br>
+ struct pipe_buffer *constbuf_misc[1];<br>
+ struct pipe_buffer *constbuf_parm[2];<br>
+<br>
+ struct nouveau_resource *immd_heap[1];<br>
+ struct nouveau_resource *parm_heap[2];<br>
<br>
struct pipe_buffer *tic;<br>
struct pipe_buffer *tsc;<br>
<br>_______________________________________________<br>
Nouveau mailing list<br>
<a href="mailto:Nouveau@lists.freedesktop.org" target="_blank">Nouveau@lists.freedesktop.org</a><br>
<a href="http://lists.freedesktop.org/mailman/listinfo/nouveau" target="_blank">http://lists.freedesktop.org/mailman/listinfo/nouveau</a><br>
<br></blockquote></div><br>