commit 93d8cfb3e13179d6ed28c4989cefc92389008f0b Author: chr Date: Tue May 5 20:54:43 2009 +0200 - extend nv50_pc to track insn nr, add allow half insn boolean - extend nv50_reg to record insn of last use and FP output hw index - add some functions for later use - modify alloc_reg to prefer final FP output hw if set - record interpolation mode in tx_prep - count number of insns in tx_prep - record depth output, and position and color input indices - inspect instructions for register usage - set pc->allow32 to FALSE on first and last insn shouldn't change generated shader code yet diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c index 1a94327..cb92a31 100644 --- a/src/gallium/drivers/nv50/nv50_program.c +++ b/src/gallium/drivers/nv50/nv50_program.c @@ -86,6 +86,9 @@ struct nv50_reg { int hw; int neg; + + int rhw; /* result hw for FP outputs */ + int acc; /* instruction where this reg is last read (first insn == 1) */ }; struct nv50_pc { @@ -109,6 +112,12 @@ struct nv50_pc { struct nv50_reg *temp_temp[16]; unsigned temp_temp_nr; + + /* current instruction and total number of insns */ + unsigned insn_cur; + unsigned insn_nr; + + boolean allow32; /* TRUE when half insns are allowed */ }; static void @@ -132,7 +141,24 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg) return; } - for (i = 0; i < NV50_SU_MAX_TEMP; i++) { + i = 0; + if (reg->rhw != -1) { + /* try to allocate temporary with index rhw first */ + if (!(pc->r_temp[reg->rhw])) { + pc->r_temp[reg->rhw] = reg; + reg->hw = reg->rhw; + if (pc->p->cfg.high_temp < (reg->rhw + 1)) + pc->p->cfg.high_temp = reg->rhw + 1; + return; + } + /* If we can't allocate the final destination index of the output, + * put it in a high temporary so we need not shuffle around later. + * (like, $r0 needs to go in $r1 and $r1 in $r0 etc.) + */ + i = pc->result_nr * 4; + } + + for (; i < NV50_SU_MAX_TEMP; i++) { if (!(pc->r_temp[i])) { pc->r_temp[i] = reg; reg->hw = i; @@ -160,6 +186,7 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) r->type = P_TEMP; r->index = -1; r->hw = i; + r->rhw = -1; pc->r_temp[i] = r; return r; } @@ -169,6 +196,56 @@ alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst) return NULL; } +static struct nv50_reg * +alloc_preferred_temp(struct nv50_pc *pc, int hw) +{ + struct nv50_reg *r; + + if (hw >= NV50_SU_MAX_TEMP || hw == -1 || pc->r_temp[hw]) + return alloc_temp(pc, NULL); + + r = CALLOC_STRUCT(nv50_reg); + r->type = P_TEMP; + r->index = -1; + r->hw = hw; + r->rhw = -1; + pc->r_temp[hw] = r; + + return r; +} + +/* Assign the hw of the discarded temporary register src + * to the tgsi register dst and free src. + */ +static void +assimilate_temp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src) +{ + assert(dst->index != -1 && src->index == -1 && src->hw != -1); + + if (dst->hw != -1) + pc->r_temp[dst->hw] = NULL; + pc->r_temp[src->hw] = dst; + dst->hw = src->hw; + + FREE(src); +} + +/* release the hardware resource held by r */ +static void +release_hw(struct nv50_pc *pc, struct nv50_reg *r) +{ + assert(r->type == P_TEMP); + if (r->hw == -1) + return; + + assert(pc->r_temp[r->hw] == r); + pc->r_temp[r->hw] = NULL; + + r->acc = 0; + if (r->index == -1) + FREE(r); +} + static void free_temp(struct nv50_pc *pc, struct nv50_reg *r) { @@ -251,7 +328,14 @@ alloc_immd(struct nv50_pc *pc, float f) struct nv50_reg *r = CALLOC_STRUCT(nv50_reg); unsigned hw; - hw = ctor_immd(pc, f, 0, 0, 0) * 4; + /* don't allocate more space if the value is already there */ + for (hw = 0; hw < pc->immd_nr * 4; ++hw) + if (pc->immd_buf[hw] == f) + break; + + if (hw == pc->immd_nr * 4) + hw = ctor_immd(pc, f, 0, 0, 0) * 4; + r->type = P_IMMD; r->hw = hw; r->index = -1; @@ -355,6 +439,12 @@ set_immd(struct nv50_pc *pc, struct nv50_reg *imm, struct nv50_program_exec *e) e->inst[1] |= (val >> 6) << 2; } + +#define INTERP_LINEAR 0 +#define INTERP_FLAT 1 +#define INTERP_PERSPECTIVE 2 +#define INTERP_CENTROID 4 + static void emit_interp(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src, struct nv50_reg *iv) @@ -535,6 +625,14 @@ set_src_2(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e) e->inst[1] |= (src->hw << 14); } +static boolean +requires_long(struct nv50_program_exec *e, struct nv50_reg *src) +{ + if (is_long(e) || src->type == P_IMMD || src->type == P_CONST) + return TRUE; + return FALSE; +} + static void emit_mul(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src0, struct nv50_reg *src1) @@ -870,6 +968,62 @@ emit_kil(struct nv50_pc *pc, struct nv50_reg *src) emit(pc, e); } +static void +emit_nop(struct nv50_pc *pc, boolean l) +{ + struct nv50_program_exec *e = exec(pc); + + e->inst[0] = 0xF0000000; + if (l) { + set_long(pc, e); + e->inst[1] = 0xE0000000; + } + + emit(pc, e); +} + +/* Adjust a bitmask that indicates what components of a source are used, + * we use this in tx_prep so we only load interpolants that are needed. + */ +static void +insn_adjust_mask(const struct tgsi_full_instruction *insn, unsigned *mask) +{ + const struct tgsi_instruction_ext_texture *tex; + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_DP3: + *mask = 0x7; + break; + case TGSI_OPCODE_DP4: + case TGSI_OPCODE_DPH: + *mask = 0xF; + break; + case TGSI_OPCODE_LIT: + *mask = 0xB; + break; + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_RSQ: + *mask = 0x1; + break; + case TGSI_OPCODE_TXP: + *mask = 0x8; + /* fall through to TEX */ + case TGSI_OPCODE_TEX: + assert(insn->Instruction.Extended); + tex = &insn->InstructionExtTexture; + + if (tex->Texture == TGSI_TEXTURE_1D) + *mask |= 0x1; + else + if (tex->Texture == TGSI_TEXTURE_2D) + *mask |= 0x3; + else + *mask |= 0x7; + default: + break; + } +} + static struct nv50_reg * tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst) { @@ -1308,12 +1462,53 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok) return TRUE; } +static void +set_acc_array(unsigned *p, const struct tgsi_full_src_register *src, + unsigned mask, unsigned n) +{ + unsigned k, c; + + for (c = 0; c < 4; c++) { + if (!(mask & (1 << c))) + continue; + + k = tgsi_util_get_full_src_register_extswizzle(src, c); + switch (k) { + case TGSI_EXTSWIZZLE_X: + case TGSI_EXTSWIZZLE_Y: + case TGSI_EXTSWIZZLE_Z: + case TGSI_EXTSWIZZLE_W: + p[src->SrcRegister.Index * 4 + k] = n; + break; + default: + break; + } + } +} + static boolean nv50_program_tx_prep(struct nv50_pc *pc) { struct tgsi_parse_context p; boolean ret = FALSE; unsigned i, c; + unsigned fcol, bcol, fcrd, depr; + + /* record interpolation mode from declaration */ + boolean centroid_load = FALSE; + boolean perspect_load = FALSE; + unsigned interp_mode[32]; + + /* track register usage for temps and attrs */ + unsigned *last_t_use = NULL; + unsigned *last_a_use = NULL; + + depr = fcol = bcol = fcrd = 0xFFFFFFFF; + + if (pc->p->type == PIPE_SHADER_FRAGMENT) { + pc->p->cfg.fp.regs[0] = 0x01000404; + pc->p->cfg.fp.regs[1] = 0x00000400; + } tgsi_parse_init(&p, pc->p->pipe.tokens); while (!tgsi_parse_end_of_tokens(&p)) { @@ -1326,6 +1521,10 @@ nv50_program_tx_prep(struct nv50_pc *pc) const struct tgsi_full_immediate *imm = &p.FullToken.FullImmediate; +#ifdef NV50_PROGRAM_DUMP + tgsi_dump_immediate(imm); +#endif + ctor_immd(pc, imm->u.ImmediateFloat32[0].Float, imm->u.ImmediateFloat32[1].Float, imm->u.ImmediateFloat32[2].Float, @@ -1335,11 +1534,16 @@ nv50_program_tx_prep(struct nv50_pc *pc) case TGSI_TOKEN_TYPE_DECLARATION: { const struct tgsi_full_declaration *d; - unsigned last; + unsigned last, first, mode; d = &p.FullToken.FullDeclaration; + first = d->DeclarationRange.First; last = d->DeclarationRange.Last; +#ifdef NV50_PROGRAM_DUMP + tgsi_dump_declaration(d); +#endif + switch (d->Declaration.File) { case TGSI_FILE_TEMPORARY: if (pc->temp_nr < (last + 1)) @@ -1348,10 +1552,71 @@ nv50_program_tx_prep(struct nv50_pc *pc) case TGSI_FILE_OUTPUT: if (pc->result_nr < (last + 1)) pc->result_nr = last + 1; + + if (!d->Declaration.Semantic) + break; + + switch (d->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + depr = first; + pc->p->cfg.fp.regs[2] |= 0x00000100; + pc->p->cfg.fp.regs[3] |= 0x00000011; + break; + default: + break; + } break; case TGSI_FILE_INPUT: + { if (pc->attr_nr < (last + 1)) pc->attr_nr = last + 1; + + if (pc->p->type != PIPE_SHADER_FRAGMENT) + break; + + switch (d->Declaration.Interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + mode = INTERP_FLAT; + break; + case TGSI_INTERPOLATE_PERSPECTIVE: + mode = INTERP_PERSPECTIVE; + perspect_load = TRUE; + break; + default: + mode = INTERP_LINEAR; + break; + } + + if (d->Declaration.Semantic) { + switch (d->Semantic.SemanticName) { + case TGSI_SEMANTIC_POSITION: + fcrd = first; + break; + case TGSI_SEMANTIC_COLOR: + fcol = first; + mode = INTERP_PERSPECTIVE; + perspect_load = TRUE; + break; + case TGSI_SEMANTIC_BCOLOR: + bcol = first; + mode = INTERP_PERSPECTIVE; + perspect_load = TRUE; + break; + default: + break; + } + } + + if (d->Declaration.Centroid) { + mode |= INTERP_CENTROID; + centroid_load = TRUE; + perspect_load = FALSE; + } + + assert(last < 32); + for (i = first; i <= last; i++) + interp_mode[i] = mode; + } break; case TGSI_FILE_CONSTANT: if (pc->param_nr < (last + 1)) @@ -1367,6 +1632,43 @@ nv50_program_tx_prep(struct nv50_pc *pc) } break; case TGSI_TOKEN_TYPE_INSTRUCTION: + { + const struct tgsi_full_instruction *insn; + const struct tgsi_full_src_register *src; + const struct tgsi_dst_register *dst; + unsigned mask; + + pc->insn_nr++; + + if (!last_t_use) { + last_t_use = CALLOC(pc->temp_nr * 4, sizeof(unsigned)); + last_a_use = CALLOC(pc->attr_nr * 4, sizeof(unsigned)); + } + + insn = &tok->FullInstruction; + dst = &insn->FullDstRegisters[0].DstRegister; + mask = dst->WriteMask; + +#ifdef NV50_PROGRAM_DUMP + tgsi_dump_instruction(insn, 1); +#endif + if (dst->File == TGSI_FILE_TEMPORARY) { + for (c = 0; c < 4; c++) + if (mask & (1 << c)) + last_t_use[dst->Index * 4 + c] = pc->insn_nr; + } + + for (i = 0; i < insn->Instruction.NumSrcRegs; ++i) { + src = &insn->FullSrcRegisters[i]; + insn_adjust_mask(insn, &mask); + + if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) + set_acc_array(last_t_use, src, mask, pc->insn_nr); + else + if (src->SrcRegister.File == TGSI_FILE_INPUT) + set_acc_array(last_a_use, src, mask, pc->insn_nr); + } + } break; default: break; @@ -1487,6 +1789,11 @@ nv50_program_tx_prep(struct nv50_pc *pc) } } + if (last_t_use) + FREE(last_t_use); + if (last_a_use) + FREE(last_a_use); + ret = TRUE; out_err: tgsi_parse_free(&p); @@ -1516,8 +1823,15 @@ nv50_program_tx(struct nv50_program *p) tgsi_parse_token(&parse); + /* don't allow half insn on first and last (not END) instruction */ + if (pc->insn_cur == 0 || pc->insn_cur + 2 == pc->insn_nr) + pc->allow32 = FALSE; + else + pc->allow32 = TRUE; + switch (tok->Token.Type) { case TGSI_TOKEN_TYPE_INSTRUCTION: + ++pc->insn_cur; ret = nv50_program_tx_insn(pc, tok); if (ret == FALSE) goto out_err; diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h index 78deed6..3b3b6bb 100644 --- a/src/gallium/drivers/nv50/nv50_program.h +++ b/src/gallium/drivers/nv50/nv50_program.h @@ -39,6 +39,11 @@ struct nv50_program { struct { unsigned attr[2]; } vp; + struct { + unsigned regs[4]; + unsigned map[4]; + unsigned high_map; + } fp; } cfg; };