[Mesa-dev] [PATCH] freedreno: a2xx: ir2 update
Rob Clark
robdclark at gmail.com
Fri Aug 10 12:31:05 UTC 2018
()
On Tue, Jul 24, 2018 at 9:00 AM Jonathan Marek <jonathan at marek.ca> wrote:
>
> this patch brings a number of changes to ir2:
> -ir2 now generates CF clauses as necessary during assembly. this simplifies
> fd2_program/fd2_compiler and is necessary to implement optimization passes
> -ir2 now has separate vector/scalar instructions. this will make it easier
> to implement scheduling of scalar+vector instructions together. dst_reg
> is also now separate from src registers instead of a single list
> -ir2 now implements register allocation. this makes it possible to compile
> shaders which have more than 64 TGSI registers
> -ir2 now implements the following optimizations: removal of IN/OUT MOV
> instructions generated by TGSI and removal of unused instructions when
> some exports are disabled
> -ir2 now allows full 8-bit index for constants
> -ir2_alloc no longer allocates 4 times too many bytes
>
So, this might be easier to review if it was split up a bit better
into multiple patches.
That said, I think I'll merge it as is, mostly because you folks are
the main ones using and working on a2xx currently, and it isn't
something that would break a3xx+.
However, a few recommendations for the future:
1) you probably want to start running piglit and/or deqp_gles2.
(Piglit has better desktop gl coverage, deqp has better gles coverage.
Not sure whether you care more about gl or gles.) Due to the feature
level of a2xx (and because, iirc, I didn't start running piglit much
until a3xx), I guess there will be a lot of skips and fails, but the main
thing you want to watch for is tests that transition pass->fail.
piglit-summary.py can compare before/after piglit runs. (Not really
sure how to do that best with deqp, but you can use piglit to run deqp
tests.)
2) shader-db is good for measuring the effect of compiler changes
across a bunch of shaders. Probably worth wiring up shaderdb traces
for a2xx, see dump_shader_info() in ir3_shader.c for example. I
suppose you could use the same format for the traces and re-use
fd-report.py in the shader-db tree to parse before/after results.
(That script could probably use some improvements, like splitting
VS/FS results.. I guess I'll do that next time I work up the courage
to hack on python.)
3) It seems like eventually you'll want to stop re-inventing
register_allocate.[ch]. Perhaps it is overkill for a2xx; I guess
there wasn't anything complicated like multiple register banks or
conflicting register classes. So maybe this is fine for now.
BR,
-R
> Signed-off-by: Jonathan Marek <jonathan at marek.ca>
> ---
> .../drivers/freedreno/a2xx/fd2_compiler.c | 210 ++---
> .../drivers/freedreno/a2xx/fd2_program.c | 75 +-
> .../drivers/freedreno/a2xx/instr-a2xx.h | 28 +-
> src/gallium/drivers/freedreno/a2xx/ir-a2xx.c | 734 +++++++++++-------
> src/gallium/drivers/freedreno/a2xx/ir-a2xx.h | 113 +--
> 5 files changed, 615 insertions(+), 545 deletions(-)
>
> diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> index 3ad47f9850..12f9a1ce0a 100644
> --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> @@ -93,9 +93,6 @@ struct fd2_compile_context {
> unsigned position, psize;
>
> uint64_t need_sync;
> -
> - /* current exec CF instruction */
> - struct ir2_cf *cf;
> };
>
> static int
> @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
>
> ctx->prog = prog;
> ctx->so = so;
> - ctx->cf = NULL;
> ctx->pred_depth = 0;
>
> ret = tgsi_parse_init(&ctx->parser, so->tokens);
> @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx)
> tgsi_parse_free(&ctx->parser);
> }
>
> -static struct ir2_cf *
> -next_exec_cf(struct fd2_compile_context *ctx)
> -{
> - struct ir2_cf *cf = ctx->cf;
> - if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
> - ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
> - return cf;
> -}
> -
> static void
> compile_vtx_fetch(struct fd2_compile_context *ctx)
> {
> @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
> int i;
> for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
> struct ir2_instruction *instr = ir2_instr_create(
> - next_exec_cf(ctx), IR2_FETCH);
> + ctx->so->ir, IR2_FETCH);
> instr->fetch.opc = VTX_FETCH;
>
> ctx->need_sync |= 1 << (i+1);
>
> - ir2_reg_create(instr, i+1, "xyzw", 0);
> - ir2_reg_create(instr, 0, "x", 0);
> + ir2_dst_create(instr, i+1, "xyzw", 0);
> + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> if (i == 0)
> instr->sync = true;
> @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
> vfetch_instrs[i] = instr;
> }
> ctx->so->num_vfetch_instrs = i;
> - ctx->cf = NULL;
> }
>
> /*
> @@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx)
> return num;
> }
>
> -static struct ir2_register *
> +static struct ir2_dst_register *
> add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
> const struct tgsi_dst_register *dst)
> {
> @@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
> swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
> swiz[4] = '\0';
>
> - return ir2_reg_create(alu, num, swiz, flags);
> + return ir2_dst_create(alu, num, swiz, flags);
> }
>
> -static struct ir2_register *
> +static struct ir2_src_register *
> add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
> const struct tgsi_src_register *src)
> {
> @@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
> if (ctx->type == PIPE_SHADER_VERTEX) {
> num = src->Index + 1;
> } else {
> + flags |= IR2_REG_INPUT;
> num = export_linkage(ctx,
> ctx->input_export_idx[src->Index]);
> }
> @@ -415,7 +402,7 @@ static void
> add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
> {
> if (inst->Instruction.Saturate) {
> - alu->alu.vector_clamp = true;
> + alu->alu_vector.clamp = true;
> }
> }
>
> @@ -423,7 +410,7 @@ static void
> add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
> {
> if (inst->Instruction.Saturate) {
> - alu->alu.scalar_clamp = true;
> + alu->alu_scalar.clamp = true;
> }
> }
>
> @@ -461,27 +448,12 @@ add_regs_vector_3(struct fd2_compile_context *ctx,
> assert(inst->Instruction.NumDstRegs == 1);
>
> add_dst_reg(ctx, alu, &inst->Dst[0].Register);
> - /* maybe should re-arrange the syntax some day, but
> - * in assembler/disassembler and what ir.c expects
> - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> - */
> - add_src_reg(ctx, alu, &inst->Src[2].Register);
> add_src_reg(ctx, alu, &inst->Src[0].Register);
> add_src_reg(ctx, alu, &inst->Src[1].Register);
> + add_src_reg(ctx, alu, &inst->Src[2].Register);
> add_vector_clamp(inst, alu);
> }
>
> -static void
> -add_regs_dummy_vector(struct ir2_instruction *alu)
> -{
> - /* create dummy, non-written vector dst/src regs
> - * for unused vector instr slot:
> - */
> - ir2_reg_create(alu, 0, "____", 0); /* vector dst */
> - ir2_reg_create(alu, 0, NULL, 0); /* vector src1 */
> - ir2_reg_create(alu, 0, NULL, 0); /* vector src2 */
> -}
> -
> static void
> add_regs_scalar_1(struct fd2_compile_context *ctx,
> struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
> @@ -489,8 +461,6 @@ add_regs_scalar_1(struct fd2_compile_context *ctx,
> assert(inst->Instruction.NumSrcRegs == 1);
> assert(inst->Instruction.NumDstRegs == 1);
>
> - add_regs_dummy_vector(alu);
> -
> add_dst_reg(ctx, alu, &inst->Dst[0].Register);
> add_src_reg(ctx, alu, &inst->Src[0].Register);
> add_scalar_clamp(inst, alu);
> @@ -567,19 +537,13 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
> struct ir2_instruction *alu;
> struct tgsi_dst_register pred_dst;
>
> - /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
> - * themselves:
> - */
> - ctx->cf = NULL;
> -
> if (ctx->pred_depth == 0) {
> /* assign predicate register: */
> ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
>
> get_predicate(ctx, &pred_dst, NULL);
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
> - add_regs_dummy_vector(alu);
> + alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs);
> add_dst_reg(ctx, alu, &pred_dst);
> add_src_reg(ctx, alu, src);
> } else {
> @@ -587,7 +551,7 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
>
> get_predicate(ctx, &pred_dst, &pred_src);
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> + alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_dst_reg(ctx, alu, &pred_dst);
> add_src_reg(ctx, alu, &pred_src);
> add_src_reg(ctx, alu, src);
> @@ -600,18 +564,11 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
>
> /* save previous pred state to restore in pop_predicate(): */
> ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
> -
> - ctx->cf = NULL;
> }
>
> static void
> pop_predicate(struct fd2_compile_context *ctx)
> {
> - /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
> - * themselves:
> - */
> - ctx->cf = NULL;
> -
> /* restore previous predicate state: */
> ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
>
> @@ -622,8 +579,7 @@ pop_predicate(struct fd2_compile_context *ctx)
>
> get_predicate(ctx, &pred_dst, &pred_src);
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
> - add_regs_dummy_vector(alu);
> + alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs);
> add_dst_reg(ctx, alu, &pred_dst);
> add_src_reg(ctx, alu, &pred_src);
> alu->pred = IR2_PRED_NONE;
> @@ -631,8 +587,6 @@ pop_predicate(struct fd2_compile_context *ctx)
> /* predicate register no longer needed: */
> ctx->pred_reg = -1;
> }
> -
> - ctx->cf = NULL;
> }
>
> static void
> @@ -693,12 +647,11 @@ translate_pow(struct fd2_compile_context *ctx,
>
> get_internal_temp(ctx, &tmp_dst, &tmp_src);
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
> - add_regs_dummy_vector(alu);
> + alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP);
> add_dst_reg(ctx, alu, &tmp_dst);
> add_src_reg(ctx, alu, &inst->Src[0].Register);
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> + alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_dst_reg(ctx, alu, &tmp_dst);
> add_src_reg(ctx, alu, &tmp_src);
> add_src_reg(ctx, alu, &inst->Src[1].Register);
> @@ -725,8 +678,7 @@ translate_pow(struct fd2_compile_context *ctx,
> break;
> }
>
> - alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
> - add_regs_dummy_vector(alu);
> + alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
> add_dst_reg(ctx, alu, &inst->Dst[0].Register);
> add_src_reg(ctx, alu, &tmp_src);
> add_scalar_clamp(inst, alu);
> @@ -737,7 +689,7 @@ translate_tex(struct fd2_compile_context *ctx,
> struct tgsi_full_instruction *inst, unsigned opc)
> {
> struct ir2_instruction *instr;
> - struct ir2_register *reg;
> + struct ir2_src_register *reg;
> struct tgsi_dst_register tmp_dst;
> struct tgsi_src_register tmp_src;
> const struct tgsi_src_register *coord;
> @@ -766,19 +718,18 @@ translate_tex(struct fd2_compile_context *ctx,
> *
> * dst = texture_sample(unit, coord, bias)
> */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
>
> - /* MAXv: */
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
> add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
> add_src_reg(ctx, instr, &inst->Src[0].Register);
> add_src_reg(ctx, instr, &inst->Src[0].Register);
>
> - /* RECIP_IEEE: */
> + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
> add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
> add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
> swiz[inst->Src[0].Register.SwizzleW];
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
> add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
> add_src_reg(ctx, instr, &inst->Src[0].Register);
> @@ -788,7 +739,7 @@ translate_tex(struct fd2_compile_context *ctx,
> coord = &inst->Src[0].Register;
> }
>
> - instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
> + instr = ir2_instr_create(ctx->so->ir, IR2_FETCH);
> instr->fetch.opc = TEX_FETCH;
> instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
> instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT);
> @@ -807,7 +758,7 @@ translate_tex(struct fd2_compile_context *ctx,
> reg->swizzle[2] = reg->swizzle[0];
>
> /* dst register needs to be marked for sync: */
> - ctx->need_sync |= 1 << instr->regs[0]->num;
> + ctx->need_sync |= 1 << instr->dst_reg.num;
>
> /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
> instr->sync = true;
> @@ -818,7 +769,7 @@ translate_tex(struct fd2_compile_context *ctx,
> * the texture to a temp and the use ALU instruction to move
> * to output
> */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
>
> add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> add_src_reg(ctx, instr, &tmp_src);
> @@ -869,22 +820,18 @@ translate_sge_slt_seq_sne(struct fd2_compile_context *ctx,
>
> get_internal_temp(ctx, &tmp_dst, &tmp_src);
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
> add_dst_reg(ctx, instr, &tmp_dst);
> add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
> add_src_reg(ctx, instr, &inst->Src[1].Register);
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), vopc, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, vopc);
> add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> - /* maybe should re-arrange the syntax some day, but
> - * in assembler/disassembler and what ir.c expects
> - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> - */
> - get_immediate(ctx, &tmp_const, fui(c0));
> - add_src_reg(ctx, instr, &tmp_const);
> add_src_reg(ctx, instr, &tmp_src);
> get_immediate(ctx, &tmp_const, fui(c1));
> add_src_reg(ctx, instr, &tmp_const);
> + get_immediate(ctx, &tmp_const, fui(c0));
> + add_src_reg(ctx, instr, &tmp_const);
> }
>
> /* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
> @@ -904,25 +851,25 @@ translate_lrp(struct fd2_compile_context *ctx,
> get_immediate(ctx, &tmp_const, fui(1.0));
>
> /* tmp1 = (a * b) */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_dst_reg(ctx, instr, &tmp_dst1);
> add_src_reg(ctx, instr, &inst->Src[0].Register);
> add_src_reg(ctx, instr, &inst->Src[1].Register);
>
> /* tmp2 = (1 - a) */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
> add_dst_reg(ctx, instr, &tmp_dst2);
> add_src_reg(ctx, instr, &tmp_const);
> add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
>
> /* tmp2 = tmp2 * c */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_dst_reg(ctx, instr, &tmp_dst2);
> add_src_reg(ctx, instr, &tmp_src2);
> add_src_reg(ctx, instr, &inst->Src[2].Register);
>
> /* dst = tmp1 + tmp2 */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
> add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> add_src_reg(ctx, instr, &tmp_src1);
> add_src_reg(ctx, instr, &tmp_src2);
> @@ -956,33 +903,28 @@ translate_trig(struct fd2_compile_context *ctx,
> tmp_src.SwizzleX = tmp_src.SwizzleY =
> tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
>
> - /* maybe should re-arrange the syntax some day, but
> - * in assembler/disassembler and what ir.c expects
> - * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> - */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
> add_dst_reg(ctx, instr, &tmp_dst);
> - get_immediate(ctx, &tmp_const, fui(0.5));
> - add_src_reg(ctx, instr, &tmp_const);
> add_src_reg(ctx, instr, &inst->Src[0].Register);
> get_immediate(ctx, &tmp_const, fui(0.159155));
> add_src_reg(ctx, instr, &tmp_const);
> + get_immediate(ctx, &tmp_const, fui(0.5));
> + add_src_reg(ctx, instr, &tmp_const);
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
> add_dst_reg(ctx, instr, &tmp_dst);
> add_src_reg(ctx, instr, &tmp_src);
> add_src_reg(ctx, instr, &tmp_src);
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
> add_dst_reg(ctx, instr, &tmp_dst);
> - get_immediate(ctx, &tmp_const, fui(-3.141593));
> - add_src_reg(ctx, instr, &tmp_const);
> add_src_reg(ctx, instr, &tmp_src);
> get_immediate(ctx, &tmp_const, fui(6.283185));
> add_src_reg(ctx, instr, &tmp_const);
> + get_immediate(ctx, &tmp_const, fui(-3.141593));
> + add_src_reg(ctx, instr, &tmp_const);
>
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
> - add_regs_dummy_vector(instr);
> + instr = ir2_instr_create_alu_s(ctx->so->ir, op);
> add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> add_src_reg(ctx, instr, &tmp_src);
> }
> @@ -996,12 +938,12 @@ translate_dp2(struct fd2_compile_context *ctx,
> struct ir2_instruction *instr;
> /* DP2ADD c,a,b -> dot2(a,b) + c */
> /* for c we use the constant 0.0 */
> - instr = ir2_instr_create_alu(next_exec_cf(ctx), DOT2ADDv, ~0);
> - get_immediate(ctx, &tmp_const, fui(0.0f));
> + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv);
> add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> - add_src_reg(ctx, instr, &tmp_const);
> add_src_reg(ctx, instr, &inst->Src[0].Register);
> add_src_reg(ctx, instr, &inst->Src[1].Register);
> + get_immediate(ctx, &tmp_const, fui(0.0f));
> + add_src_reg(ctx, instr, &tmp_const);
> add_vector_clamp(inst, instr);
> }
>
> @@ -1015,80 +957,53 @@ translate_instruction(struct fd2_compile_context *ctx,
> {
> unsigned opc = inst->Instruction.Opcode;
> struct ir2_instruction *instr;
> - static struct ir2_cf *cf;
>
> if (opc == TGSI_OPCODE_END)
> return;
>
> - if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
> - unsigned num = inst->Dst[0].Register.Index;
> - /* seems like we need to ensure that position vs param/pixel
> - * exports don't end up in the same EXEC clause.. easy way
> - * to do this is force a new EXEC clause on first appearance
> - * of an position or param/pixel export.
> - */
> - if ((num == ctx->position) || (num == ctx->psize)) {
> - if (ctx->num_position > 0) {
> - ctx->cf = NULL;
> - ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
> - ctx->num_position - 1);
> - ctx->num_position = 0;
> - }
> - } else {
> - if (ctx->num_param > 0) {
> - ctx->cf = NULL;
> - ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
> - ctx->num_param - 1);
> - ctx->num_param = 0;
> - }
> - }
> - }
> -
> - cf = next_exec_cf(ctx);
> -
> /* TODO turn this into a table: */
> switch (opc) {
> case TGSI_OPCODE_MOV:
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
> add_regs_vector_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_RCP:
> - instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
> + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
> add_regs_scalar_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_RSQ:
> - instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
> + instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE);
> add_regs_scalar_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_SQRT:
> - instr = ir2_instr_create_alu(cf, ~0, SQRT_IEEE);
> + instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE);
> add_regs_scalar_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_MUL:
> - instr = ir2_instr_create_alu(cf, MULv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_ADD:
> - instr = ir2_instr_create_alu(cf, ADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_DP2:
> translate_dp2(ctx, inst, opc);
> break;
> case TGSI_OPCODE_DP3:
> - instr = ir2_instr_create_alu(cf, DOT3v, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_DP4:
> - instr = ir2_instr_create_alu(cf, DOT4v, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_MIN:
> - instr = ir2_instr_create_alu(cf, MINv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MINv);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_MAX:
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
> add_regs_vector_2(ctx, inst, instr);
> break;
> case TGSI_OPCODE_SLT:
> @@ -1098,22 +1013,22 @@ translate_instruction(struct fd2_compile_context *ctx,
> translate_sge_slt_seq_sne(ctx, inst, opc);
> break;
> case TGSI_OPCODE_MAD:
> - instr = ir2_instr_create_alu(cf, MULADDv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
> add_regs_vector_3(ctx, inst, instr);
> break;
> case TGSI_OPCODE_LRP:
> translate_lrp(ctx, inst, opc);
> break;
> case TGSI_OPCODE_FRC:
> - instr = ir2_instr_create_alu(cf, FRACv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
> add_regs_vector_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_FLR:
> - instr = ir2_instr_create_alu(cf, FLOORv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv);
> add_regs_vector_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_EX2:
> - instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
> + instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
> add_regs_scalar_1(ctx, inst, instr);
> break;
> case TGSI_OPCODE_POW:
> @@ -1128,10 +1043,9 @@ translate_instruction(struct fd2_compile_context *ctx,
> translate_tex(ctx, inst, opc);
> break;
> case TGSI_OPCODE_CMP:
> - instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv);
> add_regs_vector_3(ctx, inst, instr);
> - // TODO this should be src0 if regs where in sane order..
> - instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
> + instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */
> break;
> case TGSI_OPCODE_IF:
> push_predicate(ctx, &inst->Src[0].Register);
> @@ -1139,16 +1053,12 @@ translate_instruction(struct fd2_compile_context *ctx,
> break;
> case TGSI_OPCODE_ELSE:
> ctx->so->ir->pred = IR2_PRED_NE;
> - /* not sure if this is required in all cases, but blob compiler
> - * won't combine EQ and NE in same CF:
> - */
> - ctx->cf = NULL;
> break;
> case TGSI_OPCODE_ENDIF:
> pop_predicate(ctx);
> break;
> case TGSI_OPCODE_F2I:
> - instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
> + instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv);
> add_regs_vector_1(ctx, inst, instr);
> break;
> default:
> @@ -1179,8 +1089,6 @@ compile_instructions(struct fd2_compile_context *ctx)
> break;
> }
> }
> -
> - ctx->cf->cf_type = EXEC_END;
> }
>
> int
> diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> index 834a7c7fcd..34622eaba0 100644
> --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> @@ -199,7 +199,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
> instr->fetch.offset = elem->src_offset;
>
> for (j = 0; j < 4; j++)
> - instr->regs[0]->swizzle[j] = "xyzw01__"[desc->swizzle[j]];
> + instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]];
>
> assert(instr->fetch.fmt != ~0);
>
> @@ -210,7 +210,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
> instr->fetch.const_idx,
> instr->fetch.const_idx_sel,
> elem->instance_divisor,
> - instr->regs[0]->swizzle,
> + instr->dst_reg.swizzle,
> instr->fetch.stride,
> instr->fetch.offset);
> }
> @@ -307,7 +307,6 @@ static struct fd2_shader_stateobj *
> create_blit_fp(void)
> {
> struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
> - struct ir2_cf *cf;
> struct ir2_instruction *instr;
>
> if (!so)
> @@ -315,18 +314,13 @@ create_blit_fp(void)
>
> so->ir = ir2_shader_create();
>
> - cf = ir2_cf_create(so->ir, EXEC);
> -
> - instr = ir2_instr_create_tex_fetch(cf, 0);
> - ir2_reg_create(instr, 0, "xyzw", 0);
> - ir2_reg_create(instr, 0, "xyx", 0);
> + instr = ir2_instr_create_tex_fetch(so->ir, 0);
> + ir2_dst_create(instr, 0, "xyzw", 0);
> + ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT);
> instr->sync = true;
>
> - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> - cf = ir2_cf_create(so->ir, EXEC_END);
> -
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> + instr = ir2_instr_create_alu_v(so->ir, MAXv);
> + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
> ir2_reg_create(instr, 0, NULL, 0);
> ir2_reg_create(instr, 0, NULL, 0);
>
> @@ -349,7 +343,6 @@ static struct fd2_shader_stateobj *
> create_blit_vp(void)
> {
> struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
> - struct ir2_cf *cf;
> struct ir2_instruction *instr;
>
> if (!so)
> @@ -357,31 +350,23 @@ create_blit_vp(void)
>
> so->ir = ir2_shader_create();
>
> - cf = ir2_cf_create(so->ir, EXEC);
> -
> - instr = ir2_instr_create_vtx_fetch(cf, 26, 1, FMT_32_32_FLOAT, false, 8);
> + instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8);
> instr->fetch.is_normalized = true;
> - ir2_reg_create(instr, 1, "xy01", 0);
> - ir2_reg_create(instr, 0, "x", 0);
> + ir2_dst_create(instr, 1, "xy01", 0);
> + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> - instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> + instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> instr->fetch.is_normalized = true;
> - ir2_reg_create(instr, 2, "xyz1", 0);
> - ir2_reg_create(instr, 0, "x", 0);
> -
> - cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
> - cf = ir2_cf_create(so->ir, EXEC);
> + ir2_dst_create(instr, 2, "xyz1", 0);
> + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> - ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
> + instr = ir2_instr_create_alu_v(so->ir, MAXv);
> + ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
> ir2_reg_create(instr, 2, NULL, 0);
> ir2_reg_create(instr, 2, NULL, 0);
>
> - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> - cf = ir2_cf_create(so->ir, EXEC_END);
> -
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> + instr = ir2_instr_create_alu_v(so->ir, MAXv);
> + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
> ir2_reg_create(instr, 1, NULL, 0);
> ir2_reg_create(instr, 1, NULL, 0);
>
> @@ -397,7 +382,6 @@ static struct fd2_shader_stateobj *
> create_solid_fp(void)
> {
> struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
> - struct ir2_cf *cf;
> struct ir2_instruction *instr;
>
> if (!so)
> @@ -405,11 +389,8 @@ create_solid_fp(void)
>
> so->ir = ir2_shader_create();
>
> - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> - cf = ir2_cf_create(so->ir, EXEC_END);
> -
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> - ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> + instr = ir2_instr_create_alu_v(so->ir, MAXv);
> + ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
> ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
> ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
>
> @@ -430,7 +411,6 @@ static struct fd2_shader_stateobj *
> create_solid_vp(void)
> {
> struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
> - struct ir2_cf *cf;
> struct ir2_instruction *instr;
>
> if (!so)
> @@ -438,22 +418,15 @@ create_solid_vp(void)
>
> so->ir = ir2_shader_create();
>
> - cf = ir2_cf_create(so->ir, EXEC);
> -
> - instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> - ir2_reg_create(instr, 1, "xyz1", 0);
> - ir2_reg_create(instr, 0, "x", 0);
> -
> - cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
> - cf = ir2_cf_create(so->ir, EXEC);
> + instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> + ir2_dst_create(instr, 1, "xyz1", 0);
> + ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> - instr = ir2_instr_create_alu(cf, MAXv, ~0);
> - ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
> + instr = ir2_instr_create_alu_v(so->ir, MAXv);
> + ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
> ir2_reg_create(instr, 1, NULL, 0);
> ir2_reg_create(instr, 1, NULL, 0);
>
> - cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> - cf = ir2_cf_create(so->ir, EXEC_END);
>
> return assemble(so);
> }
> diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> index ac972ed35a..5a9f93ec79 100644
> --- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> +++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> @@ -147,15 +147,25 @@ typedef struct PACKED {
> uint8_t const_1_rel_abs : 1;
> uint8_t const_0_rel_abs : 1;
> /* dword2: */
> - uint8_t src3_reg : 6;
> - uint8_t src3_reg_select : 1;
> - uint8_t src3_reg_abs : 1;
> - uint8_t src2_reg : 6;
> - uint8_t src2_reg_select : 1;
> - uint8_t src2_reg_abs : 1;
> - uint8_t src1_reg : 6;
> - uint8_t src1_reg_select : 1;
> - uint8_t src1_reg_abs : 1;
> + union {
> + struct {
> + uint8_t src3_reg : 6;
> + uint8_t src3_reg_select : 1;
> + uint8_t src3_reg_abs : 1;
> + uint8_t src2_reg : 6;
> + uint8_t src2_reg_select : 1;
> + uint8_t src2_reg_abs : 1;
> + uint8_t src1_reg : 6;
> + uint8_t src1_reg_select : 1;
> + uint8_t src1_reg_abs : 1;
> + };
> + /* constants have full 8-bit index */
> + struct {
> + uint8_t src3_reg_const : 8;
> + uint8_t src2_reg_const : 8;
> + uint8_t src1_reg_const : 8;
> + };
> + };
> instr_vector_opc_t vector_opc : 5;
> uint8_t src3_sel : 1;
> uint8_t src2_sel : 1;
> diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> index 42a9ab494e..af9811864f 100644
> --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> +++ b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> @@ -35,19 +35,13 @@
> #define WARN_MSG(f, ...) DBG("WARN: "f, ##__VA_ARGS__)
> #define ERROR_MSG(f, ...) DBG("ERROR: "f, ##__VA_ARGS__)
>
> -#define REG_MASK 0x3f
> -
> -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr);
> -
> static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
> uint32_t idx, struct ir2_shader_info *info);
>
> -static void reg_update_stats(struct ir2_register *reg,
> - struct ir2_shader_info *info, bool dest);
> -static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n);
> -static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg);
> -static uint32_t reg_alu_dst_swiz(struct ir2_register *reg);
> -static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
> +static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n);
> +static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg);
> +static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg);
> +static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg);
>
> /* simple allocator to carve allocations out of an up-front allocated heap,
> * so that we can free everything easily in one shot.
> @@ -55,7 +49,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
> static void * ir2_alloc(struct ir2_shader *shader, int sz)
> {
> void *ptr = &shader->heap[shader->heap_idx];
> - shader->heap_idx += align(sz, 4);
> + shader->heap_idx += align(sz, 4) / 4;
> return ptr;
> }
>
> @@ -74,7 +68,9 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str)
> struct ir2_shader * ir2_shader_create(void)
> {
> DEBUG_MSG("");
> - return calloc(1, sizeof(struct ir2_shader));
> + struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader));
> + shader->max_reg = -1;
> + return shader;
> }
>
> void ir2_shader_destroy(struct ir2_shader *shader)
> @@ -83,189 +79,344 @@ void ir2_shader_destroy(struct ir2_shader *shader)
> free(shader);
> }
>
> -/* resolve addr/cnt/sequence fields in the individual CF's */
> -static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info)
> +/* check if an instruction is a simple MOV
> + */
> +static struct ir2_instruction * simple_mov(struct ir2_instruction *instr,
> + bool output)
> {
> - uint32_t addr;
> - unsigned i;
> - int j;
> -
> - addr = shader->cfs_count / 2;
> - for (i = 0; i < shader->cfs_count; i++) {
> - struct ir2_cf *cf = shader->cfs[i];
> - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
> - uint32_t sequence = 0;
> -
> - if (cf->exec.addr && (cf->exec.addr != addr))
> - WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i);
> - if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count))
> - WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i);
> -
> - for (j = cf->exec.instrs_count - 1; j >= 0; j--) {
> - struct ir2_instruction *instr = cf->exec.instrs[j];
> - sequence <<= 2;
> - if (instr->instr_type == IR2_FETCH)
> - sequence |= 0x1;
> - if (instr->sync)
> - sequence |= 0x2;
> - }
> + struct ir2_src_register *src_reg = instr->src_reg;
> + struct ir2_dst_register *dst_reg = &instr->dst_reg;
> + struct ir2_register *reg;
> + unsigned i;
> +
> + /* MAXv used for MOV */
> + if (instr->instr_type != IR2_ALU_VECTOR ||
> + instr->alu_vector.opc != MAXv)
> + return NULL;
> +
> + /* non identical srcs */
> + if (src_reg[0].num != src_reg[1].num)
> + return NULL;
> +
> + /* flags */
> + int flags = IR2_REG_NEGATE | IR2_REG_ABS;
> + if (output)
> + flags |= IR2_REG_INPUT | IR2_REG_CONST;
> + if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags))
> + return NULL;
> +
> + /* clamping */
> + if (instr->alu_vector.clamp)
> + return NULL;
> +
> + /* swizzling */
> + for (i = 0; i < 4; i++) {
> + char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i];
> + if (swiz == '_')
> + continue;
> +
> + if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] ||
> + swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i])
> + return NULL;
> + }
> +
> + if (output)
> + reg = &instr->shader->reg[src_reg[0].num];
> + else
> + reg = &instr->shader->reg[dst_reg->num];
> +
> + assert(reg->write_idx >= 0);
> + if (reg->write_idx != reg->write_idx2)
> + return NULL;
> +
> + if (!output)
> + return instr;
> +
> + instr = instr->shader->instr[reg->write_idx];
> + return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr;
> +}
>
> - cf->exec.addr = addr;
> - cf->exec.cnt = cf->exec.instrs_count;
> - cf->exec.sequence = sequence;
> +static int src_to_reg(struct ir2_instruction *instr,
> + struct ir2_src_register *reg)
> +{
> + if (reg->flags & IR2_REG_CONST)
> + return reg->num;
>
> - addr += cf->exec.instrs_count;
> - }
> - }
> + return instr->shader->reg[reg->num].reg;
> +}
> +
> +static int dst_to_reg(struct ir2_instruction *instr,
> + struct ir2_dst_register *reg)
> +{
> + if (reg->flags & IR2_REG_EXPORT)
> + return reg->num;
>
> - info->sizedwords = 3 * addr;
> + return instr->shader->reg[reg->num].reg;
> +}
>
> - return 0;
> +static bool mask_get(uint32_t *mask, unsigned index)
> +{
> + return !!(mask[index / 32] & 1 << index % 32);
> }
>
> -void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info)
> +static void mask_set(uint32_t *mask, struct ir2_register *reg, int index)
> {
> - uint32_t i, j;
> - uint32_t *ptr, *dwords = NULL;
> - uint32_t idx = 0;
> - int ret;
> -
> - info->sizedwords = 0;
> - info->max_reg = -1;
> - info->max_input_reg = 0;
> - info->regs_written = 0;
> -
> - /* we need an even # of CF's.. insert a NOP if needed */
> - if (shader->cfs_count != align(shader->cfs_count, 2))
> - ir2_cf_create(shader, NOP);
> -
> - /* first pass, resolve sizes and addresses: */
> - ret = shader_resolve(shader, info);
> - if (ret) {
> - ERROR_MSG("resolve failed: %d", ret);
> - goto fail;
> + if (reg) {
> + unsigned i;
> + for (i = 0; i < ARRAY_SIZE(reg->regmask); i++)
> + mask[i] |= reg->regmask[i];
> }
> + if (index >= 0)
> + mask[index / 32] |= 1 << index % 32;
> +}
>
> - ptr = dwords = calloc(4, info->sizedwords);
> +static bool sets_pred(struct ir2_instruction *instr)
> +{
> + return instr->instr_type == IR2_ALU_SCALAR &&
> + instr->alu_scalar.opc >= PRED_SETEs &&
> + instr->alu_scalar.opc <= PRED_SET_RESTOREs;
> +}
>
> - /* second pass, emit CF program in pairs: */
> - for (i = 0; i < shader->cfs_count; i += 2) {
> - instr_cf_t *cfs = (instr_cf_t *)ptr;
> - ret = cf_emit(shader->cfs[i], &cfs[0]);
> - if (ret) {
> - ERROR_MSG("CF emit failed: %d\n", ret);
> - goto fail;
> +
> +
> +void* ir2_shader_assemble(struct ir2_shader *shader,
> + struct ir2_shader_info *info)
> +{
> + /* NOTES
> + * blob compiler seems to always put PRED_* instrs in a CF by
> + * themselves, and won't combine EQ/NE in the same CF
> + * (not doing this - doesn't seem to make a difference)
> + *
> + * TODO: implement scheduling for combining vector+scalar instructions
> + * -some vector instructions can be replaced by scalar
> + */
> +
> + /* first step:
> + * 1. remove "NOP" MOV instructions generated by TGSI for input/output:
> + * 2. track information for register allocation, and to remove
> + * the dead code when some exports are not needed
> + * 3. add additional instructions for a20x hw binning if needed
> + * NOTE: modifies the shader instrs
> + * this step could be done as instructions are added by compiler instead
> + */
> +
> + /* mask of exports that must be generated
> + * used to avoid calculating ps exports with hw binning
> + */
> + uint64_t export = ~0ull;
> + /* bitmask of variables required for exports defined by "export" */
> + uint32_t export_mask[REG_MASK/32+1] = {};
> +
> + unsigned idx, reg_idx;
> + unsigned max_input = 0;
> + int export_size = -1;
> +
> + for (idx = 0; idx < shader->instr_count; idx++) {
> + struct ir2_instruction *instr = shader->instr[idx], *prev;
> + struct ir2_dst_register dst_reg = instr->dst_reg;
> +
> + if (dst_reg.flags & IR2_REG_EXPORT) {
> + if (dst_reg.num < 32)
> + export_size++;
> +
> + if ((prev = simple_mov(instr, true))) {
> + /* copy instruction but keep dst */
> + *instr = *prev;
> + instr->dst_reg = dst_reg;
> + }
> }
> - ret = cf_emit(shader->cfs[i+1], &cfs[1]);
> - if (ret) {
> - ERROR_MSG("CF emit failed: %d\n", ret);
> - goto fail;
> +
> + for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) {
> + struct ir2_src_register *src_reg = &instr->src_reg[reg_idx];
> + struct ir2_register *reg;
> + int num;
> +
> + if (src_reg->flags & IR2_REG_CONST)
> + continue;
> +
> + num = src_reg->num;
> + reg = &shader->reg[num];
> + reg->read_idx = idx;
> +
> + if (src_reg->flags & IR2_REG_INPUT) {
> + max_input = MAX2(max_input, num);
> + } else {
> + /* bypass simple mov used to set src_reg */
> + assert(reg->write_idx >= 0);
> + prev = shader->instr[reg->write_idx];
> + if (simple_mov(prev, false)) {
> + *src_reg = prev->src_reg[0];
> + /* process same src_reg again */
> + reg_idx -= 1;
> + continue;
> + }
> + }
> +
> + /* update dependencies */
> + uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
> + export_mask : shader->reg[dst_reg.num].regmask;
> + mask_set(mask, reg, num);
> + if (sets_pred(instr))
> + mask_set(export_mask, reg, num);
> }
> - ptr += 3;
> - assert((ptr - dwords) <= info->sizedwords);
> }
>
> - /* third pass, emit ALU/FETCH: */
> - for (i = 0; i < shader->cfs_count; i++) {
> - struct ir2_cf *cf = shader->cfs[i];
> - if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
> - for (j = 0; j < cf->exec.instrs_count; j++) {
> - ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info);
> - if (ret) {
> - ERROR_MSG("instruction emit failed: %d", ret);
> - goto fail;
> - }
> - ptr += 3;
> - assert((ptr - dwords) <= info->sizedwords);
> + /* second step:
> + * emit instructions (with CFs) + RA
> + */
> + instr_cf_t cfs[128], *cf = cfs;
> + uint32_t alufetch[3*256], *af = alufetch;
> +
> + /* RA is done on write, so inputs must be allocated here */
> + for (reg_idx = 0; reg_idx <= max_input; reg_idx++)
> + shader->reg[reg_idx].reg = reg_idx;
> + info->max_reg = max_input;
> +
> + /* CF instr state */
> + instr_cf_exec_t exec = { .opc = EXEC };
> + instr_cf_alloc_t alloc = { .opc = ALLOC };
> + bool need_alloc = 0;
> + bool pos_export = 0;
> +
> + export_size = MAX2(export_size, 0);
> +
> + for (idx = 0; idx < shader->instr_count; idx++) {
> + struct ir2_instruction *instr = shader->instr[idx];
> + struct ir2_dst_register *dst_reg = &instr->dst_reg;
> + unsigned num = dst_reg->num;
> + struct ir2_register *reg;
> +
> + /* a2xx only has 64 registers, so we can use a single 64-bit mask */
> + uint64_t regmask = 0ull;
> +
> + /* compute the current regmask */
> + for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) {
> + reg = &shader->reg[reg_idx];
> + if ((int) idx > reg->write_idx && idx < reg->read_idx)
> + regmask |= (1ull << reg->reg);
> + }
> +
> + if (dst_reg->flags & IR2_REG_EXPORT) {
> + /* skip if export is not needed */
> + if (!(export & (1ull << num)))
> + continue;
> +
> + /* ALLOC CF:
> + * want to alloc all < 32 at once
> + * 32/33 and 62/63 come in pairs
> + * XXX assuming all 3 types are never interleaved
> + */
> + if (num < 32) {
> + alloc.size = export_size;
> + alloc.buffer_select = SQ_PARAMETER_PIXEL;
> + need_alloc = export_size >= 0;
> + export_size = -1;
> + } else if (num == 32 || num == 33) {
> + alloc.size = 0;
> + alloc.buffer_select = SQ_MEMORY;
> + need_alloc = num != 33;
> + } else {
> + alloc.size = 0;
> + alloc.buffer_select = SQ_POSITION;
> + need_alloc = !pos_export;
> + pos_export = true;
> }
> +
> + } else {
> + /* skip if dst register not needed to compute exports */
> + if (!mask_get(export_mask, num))
> + continue;
> +
> + /* RA on first write */
> + reg = &shader->reg[num];
> + if (reg->write_idx == idx) {
> + reg->reg = ffsll(~regmask) - 1;
> + info->max_reg = MAX2(info->max_reg, reg->reg);
> + }
> + }
> +
> + if (exec.count == 6 || (exec.count && need_alloc)) {
> + *cf++ = *(instr_cf_t*) &exec;
> + exec.address += exec.count;
> + exec.serialize = 0;
> + exec.count = 0;
> }
> +
> + if (need_alloc) {
> + *cf++ = *(instr_cf_t*) &alloc;
> + need_alloc = false;
> + }
> +
> + int ret = instr_emit(instr, af, idx, info); af += 3;
> + assert(!ret);
> +
> + if (instr->instr_type == IR2_FETCH)
> + exec.serialize |= 0x1 << exec.count * 2;
> + if (instr->sync)
> + exec.serialize |= 0x2 << exec.count * 2;
> + exec.count += 1;
> }
>
> - return dwords;
>
> -fail:
> - free(dwords);
> - return NULL;
> -}
> + exec.opc = !export_size ? EXEC : EXEC_END;
> + *cf++ = *(instr_cf_t*) &exec;
> + exec.address += exec.count;
> + exec.serialize = 0;
> + exec.count = 0;
>
> + /* GPU will hang without at least one pixel alloc */
> + if (!export_size) {
> + alloc.size = 0;
> + alloc.buffer_select = SQ_PARAMETER_PIXEL;
> + *cf++ = *(instr_cf_t*) &alloc;
>
> -struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type)
> -{
> - struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf));
> - DEBUG_MSG("%d", cf_type);
> - cf->shader = shader;
> - cf->cf_type = cf_type;
> - assert(shader->cfs_count < ARRAY_SIZE(shader->cfs));
> - shader->cfs[shader->cfs_count++] = cf;
> - return cf;
> -}
> + exec.opc = EXEC_END;
> + *cf++ = *(instr_cf_t*) &exec;
> + }
>
> + unsigned num_cfs = cf - cfs;
>
> -/*
> - * CF instructions:
> - */
> + /* insert nop to get an even # of CFs */
> + if (num_cfs % 2) {
> + *cf++ = (instr_cf_t) { .opc = NOP };
> + num_cfs++;
> + }
>
> -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr)
> -{
> - memset(instr, 0, sizeof(*instr));
> -
> - instr->opc = cf->cf_type;
> -
> - switch (cf->cf_type) {
> - case NOP:
> - break;
> - case EXEC:
> - case EXEC_END:
> - assert(cf->exec.addr <= 0x1ff);
> - assert(cf->exec.cnt <= 0x6);
> - assert(cf->exec.sequence <= 0xfff);
> - instr->exec.address = cf->exec.addr;
> - instr->exec.count = cf->exec.cnt;
> - instr->exec.serialize = cf->exec.sequence;
> - break;
> - case ALLOC:
> - assert(cf->alloc.size <= 0xf);
> - instr->alloc.size = cf->alloc.size;
> - switch (cf->alloc.type) {
> - case SQ_POSITION:
> - case SQ_PARAMETER_PIXEL:
> - instr->alloc.buffer_select = cf->alloc.type;
> + /* offset cf addrs */
> + for (idx = 0; idx < num_cfs; idx++) {
> + switch (cfs[idx].opc) {
> + case EXEC:
> + case EXEC_END:
> + cfs[idx].exec.address += num_cfs / 2;
> break;
> default:
> - ERROR_MSG("invalid alloc type: %d", cf->alloc.type);
> - return -1;
> + break;
> + /* XXX and any other address using cf that gets implemented */
> }
> - break;
> - case COND_EXEC:
> - case COND_EXEC_END:
> - case COND_PRED_EXEC:
> - case COND_PRED_EXEC_END:
> - case LOOP_START:
> - case LOOP_END:
> - case COND_CALL:
> - case RETURN:
> - case COND_JMP:
> - case COND_EXEC_PRED_CLEAN:
> - case COND_EXEC_PRED_CLEAN_END:
> - case MARK_VS_FETCH_DONE:
> - ERROR_MSG("TODO");
> - return -1;
> }
>
> - return 0;
> + /* concatenate cfs+alufetchs */
> + uint32_t cfdwords = num_cfs / 2 * 3;
> + uint32_t alufetchdwords = exec.address * 3;
> + info->sizedwords = cfdwords + alufetchdwords;
> + uint32_t *dwords = malloc(info->sizedwords * 4);
> + assert(dwords);
> + memcpy(dwords, cfs, cfdwords * 4);
> + memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4);
> + return dwords;
> }
>
> -
> -struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type)
> +struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
> + int instr_type)
> {
> struct ir2_instruction *instr =
> - ir2_alloc(cf->shader, sizeof(struct ir2_instruction));
> + ir2_alloc(shader, sizeof(struct ir2_instruction));
> DEBUG_MSG("%d", instr_type);
> - instr->shader = cf->shader;
> - instr->pred = cf->shader->pred;
> + instr->shader = shader;
> + instr->idx = shader->instr_count;
> + instr->pred = shader->pred;
> instr->instr_type = instr_type;
> - assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs));
> - cf->exec.instrs[cf->exec.instrs_count++] = instr;
> + shader->instr[shader->instr_count++] = instr;
> return instr;
> }
>
> @@ -279,15 +430,11 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
> struct ir2_shader_info *info)
> {
> instr_fetch_t *fetch = (instr_fetch_t *)dwords;
> - int reg = 0;
> - struct ir2_register *dst_reg = instr->regs[reg++];
> - struct ir2_register *src_reg = instr->regs[reg++];
> + struct ir2_dst_register *dst_reg = &instr->dst_reg;
> + struct ir2_src_register *src_reg = &instr->src_reg[0];
>
> memset(fetch, 0, sizeof(*fetch));
>
> - reg_update_stats(dst_reg, info, true);
> - reg_update_stats(src_reg, info, false);
> -
> fetch->opc = instr->fetch.opc;
>
> if (instr->fetch.opc == VTX_FETCH) {
> @@ -298,9 +445,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
> assert(instr->fetch.const_idx <= 0x1f);
> assert(instr->fetch.const_idx_sel <= 0x3);
>
> - vtx->src_reg = src_reg->num;
> + vtx->src_reg = src_to_reg(instr, src_reg);
> vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1);
> - vtx->dst_reg = dst_reg->num;
> + vtx->dst_reg = dst_to_reg(instr, dst_reg);
> vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg);
> vtx->must_be_one = 1;
> vtx->const_index = instr->fetch.const_idx;
> @@ -326,9 +473,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
>
>
More information about the mesa-dev
mailing list