commit ebcc4b9cf61a25d8ef2fa87eecfb5e4e75b47bca Author: chr Date: Tue May 5 20:56:12 2009 +0200 - more correct loading FP interpolants, also consider interpolation mode - use tgsi resource nv50_regs to store attributes - improve values of shader registers - make sure FP depth output goes where it's supposed to go - loop through all instructions and make sure there are no single half insns diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index cb92a31..9acf882 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -445,20 +445,29 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) #define INTERP_PERSPECTIVE 2 #define INTERP_CENTROID 4 +/* interpolant index has been stored in dst->rhw */ static void -emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, - struct nv50_reg *src, struct nv50_reg *iv) +emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *iv, + unsigned mode) { + assert(dst->rhw != -1); struct nv50_program_exec *e = exec(pc); e->inst[0] |= 0x80000000; set_dst(pc, dst, e); - alloc_reg(pc, src); - e->inst[0] |= (src->hw << 16); - if (iv) { - e->inst[0] |= (1 << 25); - alloc_reg(pc, iv); - e->inst[0] |= (iv->hw << 9); + e->inst[0] |= (dst->rhw << 16); + + if (mode & INTERP_FLAT) { + e->inst[0] |= (1 << 8); + } else { + if (mode & INTERP_PERSPECTIVE) { + e->inst[0] |= (1 << 25); + alloc_reg(pc, iv); + e->inst[0] |= (iv->hw << 9); + } + + if (mode & INTERP_CENTROID) + e->inst[0] |= (1 << 24); } emit(pc, e); @@ -982,6 +991,43 @@ emit_nop(struct nv50_pc *pc, boolean l) emit(pc, e); } +static void +convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e) +{ + unsigned q = 0, m = ~0; + + assert(!is_long(e)); + + switch (e->inst[0] >> 28) { + case 0x1: + /* MOV */ + q = 0x0403c000; + m = 0xFFFF7FFF; + break; + case 0x8: + /* INTERP */ + m = ~0x02000000; + if (e->inst[0] & 0x02000000) + q = 0x00020000; + break; + case 0xC: + /* MUL */ + break; + case 0x9: + /* RCP */ + break; + default: + assert(0); + break; + } + + set_long(pc, e); + pc->p->exec_size++; + + e->inst[0] &= m; + e->inst[1] |= q; +} + /* Adjust a bitmask that indicates what components of a source are used, * we use this in tx_prep so we only load interpolants that are needed. */ @@ -1005,20 +1051,21 @@ insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask) case TGSI_OPCODE_RSQ: *mask = 0x1; break; - case TGSI_OPCODE_TXP: - *mask = 0x8; - /* fall through to TEX */ case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXP: assert(insn->Instruction.Extended); tex = &insn->InstructionExtTexture; + *mask = 0x7; if (tex->Texture == TGSI_TEXTURE_1D) - *mask |= 0x1; + *mask = 0x1; else if (tex->Texture == TGSI_TEXTURE_2D) - *mask |= 0x3; - else - *mask |= 0x7; + *mask = 0x3; + + if (insn->Instruction.Opcode == TGSI_OPCODE_TXP) + *mask |= 0x8; + break; default: break; } @@ -1255,6 +1302,7 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) emit_kil(pc, src[0][1]); emit_kil(pc, src[0][2]); emit_kil(pc, src[0][3]); + pc->p->cfg.fp.regs[2] |= 0x00100000; break; case TGSI_OPCODE_LIT: emit_lit(pc, &dst[0], mask, &src[0][0]); @@ -1503,7 +1551,7 @@ nv50_program_tx_prep(struct nv50_pc *pc) unsigned *last_t_use = NULL; unsigned *last_a_use = NULL; - depr = fcol = bcol = fcrd = 0xFFFFFFFF; + depr = fcol = bcol = fcrd = 0xFFFF; if (pc->p->type == PIPE_SHADER_FRAGMENT) { pc->p->cfg.fp.regs[0] = 0x01000404; @@ -1683,37 +1731,106 @@ nv50_program_tx_prep(struct nv50_pc *pc) for (i = 0; i < pc->temp_nr; i++) { for (c = 0; c < 4; c++) { pc->temp[i*4+c].type = P_TEMP; - pc->temp[i*4+c].hw = -1; + pc->temp[i*4+c].hw = pc->temp[i*4+c].rhw = -1; pc->temp[i*4+c].index = i; + pc->temp[i*4+c].acc = last_t_use[i*4+c]; } } } if (pc->attr_nr) { - struct nv50_reg *iv = NULL; - int aid = 0; + struct nv50_reg *iv, *iv_c = NULL, *iv_p = NULL; + int oid, off = 4, mid = 0, aid = 0; + + /* off = VP output id offset to i*4 (oid = i*4 + off + c) + * aid = FP attribute/interpolant id (incremented only for used attrs) + * mid = VP output mapping field ID (HPOS not counted) + */ pc->attr = CALLOC(pc->attr_nr * 4, sizeof(struct nv50_reg)); if (!pc->attr) goto out_err; + i = 0; if (pc->p->type == PIPE_SHADER_FRAGMENT) { - iv = alloc_temp(pc, NULL); - emit_interp(pc, iv, iv, NULL); - emit_flop(pc, 0, iv, iv); - aid++; + + if (fcrd != 0xFFFF) { + assert(fcrd == 0); /* position input should always be 0 */ + i = 1; + off = 0; + for (c = 0; c < 4; ++c) { + if (last_a_use[c] == 0) + continue; + + pc->attr[c].index = fcrd; + pc->attr[c].type = P_TEMP; + pc->attr[c].acc = last_a_use[c]; + pc->attr[c].hw = pc->attr[c].rhw = -1; + + alloc_reg(pc, &pc->attr[c]); + pc->attr[c].rhw = aid++; + + emit_interp(pc, &pc->attr[c], NULL, INTERP_LINEAR); + pc->p->cfg.fp.regs[1] |= (1 << (24 + c)); + + switch (c) { + case 0: + case 1: + /* should probably do viewport stuff here */ + break; + case 3: + iv_p = &pc->attr[c]; + emit_flop(pc, 0, iv_p, iv_p); + break; + default: + break; + } + } + } + + if (perspect_load && !iv_p) { + iv_p = alloc_temp(pc, NULL); + iv_p->rhw = aid++; + emit_interp(pc, iv_p, NULL, INTERP_LINEAR); + emit_flop(pc, 0, iv_p, iv_p); + pc->p->cfg.fp.regs[1] |= 0x08000000; + } + + if (centroid_load) { + iv_c = alloc_temp(pc, NULL); + iv_c->rhw = iv_p ? aid - 1 : aid++; + emit_interp(pc, iv_c, NULL, INTERP_CENTROID); + emit_flop(pc, 0, iv_c, iv_c); + pc->p->cfg.fp.regs[1] |= 0x08000000; + } } - for (i = 0; i < pc->attr_nr; i++) { + for (; i < pc->attr_nr; i++) { struct nv50_reg *a = &pc->attr[i*4]; + iv = (interp_mode[i] & INTERP_CENTROID) ? iv_c : iv_p; for (c = 0; c < 4; c++) { if (pc->p->type == PIPE_SHADER_FRAGMENT) { - struct nv50_reg *at = - alloc_temp(pc, NULL); - pc->attr[i*4+c].type = at->type; - pc->attr[i*4+c].hw = at->hw; - pc->attr[i*4+c].index = at->index; + a[c].hw = a[c].rhw = -1; + a[c].index = -1; + if (last_a_use[i*4+c] == 0) + continue; + + if (i == fcol || i == bcol) + pc->p->cfg.fp.regs[0] += 0x00010000; + pc->p->cfg.fp.regs[1] += 0x00010000; + + a[c].index = i; + a[c].type = P_TEMP; + a[c].acc = last_a_use[i*4+c]; + + alloc_reg(pc, &a[c]); + a[c].rhw = aid++; + emit_interp(pc, &a[c], iv, interp_mode[i]); + + oid = off + i * 4 + c; + pc->p->cfg.fp.map[mid / 4] |= oid << (8 * (mid % 4)); + mid++; } else { pc->p->cfg.vp.attr[aid/32] |= (1 << (aid % 32)); @@ -1722,18 +1839,16 @@ nv50_program_tx_prep(struct nv50_pc *pc) pc->attr[i*4+c].index = i; } } + } - if (pc->p->type != PIPE_SHADER_FRAGMENT) - continue; + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + pc->p->cfg.fp.high_map = (mid / 4) + ((mid % 4) ? 1 : 0); - emit_interp(pc, &a[0], &a[0], iv); - emit_interp(pc, &a[1], &a[1], iv); - emit_interp(pc, &a[2], &a[2], iv); - emit_interp(pc, &a[3], &a[3], iv); + if (iv_p && iv_p->index == -1) + free_temp(pc, iv_p); + if (iv_c) + free_temp(pc, iv_c); } - - if (iv) - free_temp(pc, iv); } if (pc->result_nr) { @@ -1748,9 +1863,15 @@ nv50_program_tx_prep(struct nv50_pc *pc) if (pc->p->type == PIPE_SHADER_FRAGMENT) { pc->result[i*4+c].type = P_TEMP; pc->result[i*4+c].hw = -1; + if (i == depr) { + pc->result[i*4+c].rhw = ((c == 2) ? + (pc->result_nr - 1) * 4 : -1); + } else + pc->result[i*4+c].rhw = rid++; } else { pc->result[i*4+c].type = P_RESULT; pc->result[i*4+c].hw = rid++; + pc->result[i*4+c].rhw = -1; } pc->result[i*4+c].index = i; } @@ -1805,6 +1926,7 @@ nv50_program_tx(struct nv50_program *p) { struct tgsi_parse_context parse; struct nv50_pc *pc; + unsigned i, k; boolean ret; pc = CALLOC_STRUCT(nv50_pc); @@ -1843,10 +1965,42 @@ nv50_program_tx(struct nv50_program *p) if (p->type == PIPE_SHADER_FRAGMENT) { struct nv50_reg out; - out.type = P_TEMP; - for (out.hw = 0; out.hw < pc->result_nr * 4; out.hw++) - emit_mov(pc, &out, &pc->result[out.hw]); + + for (i = 0; i < pc->result_nr * 4; i++) { + if (pc->result[i].rhw == -1) + continue; + if (pc->result[i].hw != pc->result[i].rhw) { + out.hw = pc->result[i].rhw; + emit_mov(pc, &out, &pc->result[i]); + } + if (pc->p->cfg.high_result < pc->result[i].rhw + 1) + pc->p->cfg.high_result = pc->result[i].rhw + 1; + } + } + + /* look for single half instructions and make them long */ + struct nv50_program_exec *e, *e_prev; + + for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) { + if (!is_long(e)) + k++; + + if (!e->next || is_long(e->next)) { + if (k & 1) + convert_to_long(pc, e); + k = 0; + } + + if (e->next) + e_prev = e; + } + + if (!is_long(pc->p->exec_tail)) { + /* this may occur if moving FP results */ + assert(e_prev && !is_long(e_prev)); + convert_to_long(pc, e_prev); + convert_to_long(pc, pc->p->exec_tail); } assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head)); @@ -1973,7 +2127,7 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p) if (is_long(e)) NOUVEAU_ERR("0x%08x\n", e->inst[1]); } - + FREE(up); #endif up = ptr = MALLOC(p->exec_size * 4); @@ -2058,6 +2212,7 @@ nv50_fragprog_validate(struct nv50_context *nv50) struct nouveau_grobj *tesla = nv50->screen->tesla; struct nv50_program *p = nv50->fragprog; struct nouveau_stateobj *so; + unsigned i; if (!p->translated) { nv50_program_validate(nv50, p); @@ -2068,24 +2223,30 @@ nv50_fragprog_validate(struct nv50_context *nv50) nv50_program_validate_data(nv50, p); nv50_program_validate_code(nv50, p); - so = so_new(64, 2); + so = so_new(32, 2); so_method(so, tesla, NV50TCL_FP_ADDRESS_HIGH, 2); so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_HIGH, 0, 0); so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD | NOUVEAU_BO_LOW, 0, 0); so_method(so, tesla, 0x1904, 4); - so_data (so, 0x00040404); /* p: 0x01000404 */ + so_data (so, p->cfg.fp.regs[0]); /* 0x01000404 / 0x00040404 etc. */ so_data (so, 0x00000004); so_data (so, 0x00000000); so_data (so, 0x00000000); - so_method(so, tesla, 0x16bc, 3); /*XXX: fixme */ + so_method(so, tesla, 0x16bc, 1 + p->cfg.fp.high_map); so_data (so, 0x03020100); - so_data (so, 0x07060504); - so_data (so, 0x0b0a0908); + for (i = 0; i < p->cfg.fp.high_map; i++) + so_data(so, p->cfg.fp.map[i]); so_method(so, tesla, 0x1988, 2); - so_data (so, 0x08080408); //0x08040404); /* p: 0x0f000401 */ + so_data (so, p->cfg.fp.regs[1]); /* 0x08040404 / 0x0f000401 etc. */ so_data (so, p->cfg.high_temp); + so_method(so, tesla, 0x1298, 1); + so_data (so, p->cfg.high_result); + so_method(so, tesla, 0x19a8, 1); + so_data (so, p->cfg.fp.regs[2]); + so_method(so, tesla, 0x196c, 1); + so_data (so, p->cfg.fp.regs[3]); so_method(so, tesla, 0x1414, 1); so_data (so, 0); /* program start offset */ so_ref(so, &nv50->state.fragprog);