commit dacf2f879d63b5bf756da62eee901379336e7335 Author: chr Date: Tue May 5 20:57:15 2009 +0200 - avoid overwriting sources before they're used in cases where dst == src - add magical adjustment for register 1988 (I should find out how that really works) diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 9acf882..e4fc261 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -1162,12 +1162,40 @@ tgsi_src(struct nv50_pc *pc, int chan, const struct tgsi_full_src_register *src) return r; } +/* returns TRUE if instruction can overwrite sources before they're read */ +static boolean +direct2dest_op(const struct tgsi_full_instruction *insn) +{ + if (insn->Instruction.Saturate) + return FALSE; + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_DP3: + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + case TGSI_OPCODE_KIL: + case TGSI_OPCODE_LIT: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + case TGSI_OPCODE_SCS: + case TGSI_OPCODE_SIN: + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXP: + return FALSE; + default: + return TRUE; + } +} + static boolean nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) { const struct tgsi_full_instruction *inst = &tok->FullInstruction; struct nv50_reg *rdst[4], *dst[4], *src[3][4], *temp = NULL; unsigned mask, sat, unit; + boolean assimilate = FALSE; int i, c; mask = inst->FullDstRegisters[0].DstRegister.WriteMask; @@ -1178,6 +1206,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) dst[c] = tgsi_dst(pc, c, &inst->FullDstRegisters[0]); else dst[c] = NULL; + + rdst[c] = NULL; + + src[0][c] = NULL; + src[1][c] = NULL; + src[2][c] = NULL; } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { @@ -1195,8 +1229,35 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) rdst[c] = dst[c]; dst[c] = temp_temp(pc); } + } else if (direct2dest_op(inst)) { + for (c = 0; c < 4; c++) { + if (!dst[c] || dst[c]->type != P_TEMP) + continue; + + for (i = c + 1; i < 4; i++) { + if (dst[c] == src[0][i] || + dst[c] == src[1][i] || + dst[c] == src[2][i]) + break; + } + if (i == 4) + continue; + + assimilate = TRUE; + rdst[c] = dst[c]; + dst[c] = alloc_preferred_temp(pc, rdst[c]->rhw); + } + } else if (inst->Instruction.Opcode == TGSI_OPCODE_LIT) { + /* XXX: shouldn't give LIT an extra case here */ + if (src[0][1] == dst[1] || + src[0][3] == dst[1]) { + assimilate = TRUE; + rdst[1] = dst[1]; + dst[1] = alloc_temp(pc, NULL); + } } + i = -1; switch (inst->Instruction.Opcode) { case TGSI_OPCODE_ABS: for (c = 0; c < 4; c++) { @@ -1373,14 +1434,22 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_flop(pc, 0, dst[c], src[0][0]); + if (i == -1) { + emit_flop(pc, 0, dst[c], src[0][0]); + i = c; + } else + emit_mov(pc, dst[c], dst[i]); } break; case TGSI_OPCODE_RSQ: for (c = 0; c < 4; c++) { if (!(mask & (1 << c))) continue; - emit_flop(pc, 2, dst[c], src[0][0]); + if (i == -1) { + emit_flop(pc, 2, dst[c], src[0][0]); + i = c; + } else + emit_mov(pc, dst[c], dst[i]); } break; case TGSI_OPCODE_SCS: @@ -1491,6 +1560,10 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) set_src_0(pc, dst[c], e); emit(pc, e); } + } else if (assimilate) { + for (c = 0; c < 4; c++) + if (rdst[c]) + assimilate_temp(pc, rdst[c], dst[c]); } for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { @@ -1499,10 +1572,9 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) continue; if (src[i][c]->index == -1 && src[i][c]->type == P_IMMD) FREE(src[i][c]); - - /* Might also release temporaries not used anymore in this loop, - * therefore no temp_immd and temp_immd_nr like for temp_temp. - */ + else + if (src[i][c]->acc == pc->insn_cur) + release_hw(pc, src[i][c]); } } @@ -1996,6 +2068,18 @@ nv50_program_tx(struct nv50_program *p) e_prev = e; } + /* adjust register 1988 'heuristically' */ + /* XXX: make this go away */ + for (i = 0, k = 0; k < 4; ++k) + if (pc->p->cfg.fp.regs[1] & (1 << (24 + k))) + i++; + if (i > 3 || i < ((pc->p->cfg.fp.regs[1] >> 16) & 0xFF) + 3) { + pc->p->cfg.fp.regs[1] &= 0xFFFFFF00; + pc->p->cfg.fp.regs[1] |= ((pc->p->cfg.fp.regs[1] >> 16) & 0xFF); + } else { + pc->p->cfg.fp.regs[1] |= (3 - i); + } + if (!is_long(pc->p->exec_tail)) { /* this may occur if moving FP results */ assert(e_prev && !is_long(e_prev));