[Mesa-dev] [PATCH] freedreno: a2xx: ir2 update

Fri Aug 10 12:31:05 UTC 2018

()
On Tue, Jul 24, 2018 at 9:00 AM Jonathan Marek <jonathan at marek.ca> wrote:
>
> this patch brings a number of changes to ir2:
> -ir2 now generates CF clauses as necessary during assembly. this simplifies
>  fd2_program/fd2_compiler and is necessary to implement optimization passes
> -ir2 now has separate vector/scalar instructions. this will make it easier
>  to implement scheduling of scalar+vector instructions together. dst_reg
>  is also now separate from src registers instead of a single list
> -ir2 now implements register allocation. this makes it possible to compile
>  shaders which have more than 64 TGSI registers
> -ir2 now implements the following optimizations: removal of IN/OUT MOV
>  instructions generated by TGSI and removal of unused instructions when
>  some exports are disabled
> -ir2 now allows full 8-bit index for constants
> -ir2_alloc no longer allocates 4 times too many bytes
>

So, this might be easier to review if it was split up a bit better
into multiple patches.

That said, I think I'll merge it as is, mostly because you folks are
the main ones using and working on a2xx currently, and it isn't
something that would break a3xx+.

However, a few recommendations for the future:

1) you probably want to start running piglit and/or deqp_gles2.
(Piglit has better desktop gl coverage, deqp has better gles coverage.
Not sure whether you care more about gl or gles.)  Due to the feature
level of a2xx (and because, iirc, I didn't start running piglit much
until a3xx), I guess there will be a lot of skips and fails, but the main
thing you want to watch for is tests that transition pass->fail.
piglit-summary.py can compare before/after piglit runs.  (Not really
sure how best to do that with deqp, but you can use piglit to run deqp
tests.)

2) shader-db is good for measuring the effect of compiler changes
across a bunch of shaders.  Probably worth wiring up shader-db traces
for a2xx; see dump_shader_info() in ir3_shader.c for an example.  I
suppose you could use the same format for the traces and re-use
fd-report.py in the shader-db tree to parse before/after results.
(That script could probably use some improvements, like splitting
VS/FS results. I guess I'll do that next time I work up the courage
to hack on Python.)

3) it seems like eventually you'll want to stop re-inventing
register_allocate.[ch], though perhaps it is overkill for a2xx; I guess
there wasn't anything complicated like multiple register banks or
conflicting register classes.  So maybe this is fine for now.

BR,
-R

> Signed-off-by: Jonathan Marek <jonathan at marek.ca>
> ---
>  .../drivers/freedreno/a2xx/fd2_compiler.c     | 210 ++---
>  .../drivers/freedreno/a2xx/fd2_program.c      |  75 +-
>  .../drivers/freedreno/a2xx/instr-a2xx.h       |  28 +-
>  src/gallium/drivers/freedreno/a2xx/ir-a2xx.c  | 734 +++++++++++-------
>  src/gallium/drivers/freedreno/a2xx/ir-a2xx.h  | 113 +--
>  5 files changed, 615 insertions(+), 545 deletions(-)
>
> diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> index 3ad47f9850..12f9a1ce0a 100644
> --- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> +++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
> @@ -93,9 +93,6 @@ struct fd2_compile_context {
>         unsigned position, psize;
>
>         uint64_t need_sync;
> -
> -       /* current exec CF instruction */
> -       struct ir2_cf *cf;
>  };
>
>  static int
> @@ -130,7 +127,6 @@ compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
>
>         ctx->prog = prog;
>         ctx->so = so;
> -       ctx->cf = NULL;
>         ctx->pred_depth = 0;
>
>         ret = tgsi_parse_init(&ctx->parser, so->tokens);
> @@ -236,15 +232,6 @@ compile_free(struct fd2_compile_context *ctx)
>         tgsi_parse_free(&ctx->parser);
>  }
>
> -static struct ir2_cf *
> -next_exec_cf(struct fd2_compile_context *ctx)
> -{
> -       struct ir2_cf *cf = ctx->cf;
> -       if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
> -               ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
> -       return cf;
> -}
> -
>  static void
>  compile_vtx_fetch(struct fd2_compile_context *ctx)
>  {
> @@ -252,13 +239,13 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
>         int i;
>         for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
>                 struct ir2_instruction *instr = ir2_instr_create(
> -                               next_exec_cf(ctx), IR2_FETCH);
> +                               ctx->so->ir, IR2_FETCH);
>                 instr->fetch.opc = VTX_FETCH;
>
>                 ctx->need_sync |= 1 << (i+1);
>
> -               ir2_reg_create(instr, i+1, "xyzw", 0);
> -               ir2_reg_create(instr, 0, "x", 0);
> +               ir2_dst_create(instr, i+1, "xyzw", 0);
> +               ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
>                 if (i == 0)
>                         instr->sync = true;
> @@ -266,7 +253,6 @@ compile_vtx_fetch(struct fd2_compile_context *ctx)
>                 vfetch_instrs[i] = instr;
>         }
>         ctx->so->num_vfetch_instrs = i;
> -       ctx->cf = NULL;
>  }
>
>  /*
> @@ -312,7 +298,7 @@ get_temp_gpr(struct fd2_compile_context *ctx, int idx)
>         return num;
>  }
>
> -static struct ir2_register *
> +static struct ir2_dst_register *
>  add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
>                 const struct tgsi_dst_register *dst)
>  {
> @@ -351,10 +337,10 @@ add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
>         swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
>         swiz[4] = '\0';
>
> -       return ir2_reg_create(alu, num, swiz, flags);
> +       return ir2_dst_create(alu, num, swiz, flags);
>  }
>
> -static struct ir2_register *
> +static struct ir2_src_register *
>  add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
>                 const struct tgsi_src_register *src)
>  {
> @@ -373,6 +359,7 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
>                 if (ctx->type == PIPE_SHADER_VERTEX) {
>                         num = src->Index + 1;
>                 } else {
> +                       flags |= IR2_REG_INPUT;
>                         num = export_linkage(ctx,
>                                         ctx->input_export_idx[src->Index]);
>                 }
> @@ -415,7 +402,7 @@ static void
>  add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
>  {
>         if (inst->Instruction.Saturate) {
> -               alu->alu.vector_clamp = true;
> +               alu->alu_vector.clamp = true;
>         }
>  }
>
> @@ -423,7 +410,7 @@ static void
>  add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
>  {
>         if (inst->Instruction.Saturate) {
> -               alu->alu.scalar_clamp = true;
> +               alu->alu_scalar.clamp = true;
>         }
>  }
>
> @@ -461,27 +448,12 @@ add_regs_vector_3(struct fd2_compile_context *ctx,
>         assert(inst->Instruction.NumDstRegs == 1);
>
>         add_dst_reg(ctx, alu, &inst->Dst[0].Register);
> -       /* maybe should re-arrange the syntax some day, but
> -        * in assembler/disassembler and what ir.c expects
> -        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> -        */
> -       add_src_reg(ctx, alu, &inst->Src[2].Register);
>         add_src_reg(ctx, alu, &inst->Src[0].Register);
>         add_src_reg(ctx, alu, &inst->Src[1].Register);
> +       add_src_reg(ctx, alu, &inst->Src[2].Register);
>         add_vector_clamp(inst, alu);
>  }
>
> -static void
> -add_regs_dummy_vector(struct ir2_instruction *alu)
> -{
> -       /* create dummy, non-written vector dst/src regs
> -        * for unused vector instr slot:
> -        */
> -       ir2_reg_create(alu, 0, "____", 0); /* vector dst */
> -       ir2_reg_create(alu, 0, NULL, 0);   /* vector src1 */
> -       ir2_reg_create(alu, 0, NULL, 0);   /* vector src2 */
> -}
> -
>  static void
>  add_regs_scalar_1(struct fd2_compile_context *ctx,
>                 struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
> @@ -489,8 +461,6 @@ add_regs_scalar_1(struct fd2_compile_context *ctx,
>         assert(inst->Instruction.NumSrcRegs == 1);
>         assert(inst->Instruction.NumDstRegs == 1);
>
> -       add_regs_dummy_vector(alu);
> -
>         add_dst_reg(ctx, alu, &inst->Dst[0].Register);
>         add_src_reg(ctx, alu, &inst->Src[0].Register);
>         add_scalar_clamp(inst, alu);
> @@ -567,19 +537,13 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
>         struct ir2_instruction *alu;
>         struct tgsi_dst_register pred_dst;
>
> -       /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
> -        * themselves:
> -        */
> -       ctx->cf = NULL;
> -
>         if (ctx->pred_depth == 0) {
>                 /* assign predicate register: */
>                 ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
>
>                 get_predicate(ctx, &pred_dst, NULL);
>
> -               alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
> -               add_regs_dummy_vector(alu);
> +               alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs);
>                 add_dst_reg(ctx, alu, &pred_dst);
>                 add_src_reg(ctx, alu, src);
>         } else {
> @@ -587,7 +551,7 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
>
>                 get_predicate(ctx, &pred_dst, &pred_src);
>
> -               alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> +               alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>                 add_dst_reg(ctx, alu, &pred_dst);
>                 add_src_reg(ctx, alu, &pred_src);
>                 add_src_reg(ctx, alu, src);
> @@ -600,18 +564,11 @@ push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
>
>         /* save previous pred state to restore in pop_predicate(): */
>         ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
> -
> -       ctx->cf = NULL;
>  }
>
>  static void
>  pop_predicate(struct fd2_compile_context *ctx)
>  {
> -       /* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
> -        * themselves:
> -        */
> -       ctx->cf = NULL;
> -
>         /* restore previous predicate state: */
>         ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
>
> @@ -622,8 +579,7 @@ pop_predicate(struct fd2_compile_context *ctx)
>
>                 get_predicate(ctx, &pred_dst, &pred_src);
>
> -               alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
> -               add_regs_dummy_vector(alu);
> +               alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs);
>                 add_dst_reg(ctx, alu, &pred_dst);
>                 add_src_reg(ctx, alu, &pred_src);
>                 alu->pred = IR2_PRED_NONE;
> @@ -631,8 +587,6 @@ pop_predicate(struct fd2_compile_context *ctx)
>                 /* predicate register no longer needed: */
>                 ctx->pred_reg = -1;
>         }
> -
> -       ctx->cf = NULL;
>  }
>
>  static void
> @@ -693,12 +647,11 @@ translate_pow(struct fd2_compile_context *ctx,
>
>         get_internal_temp(ctx, &tmp_dst, &tmp_src);
>
> -       alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
> -       add_regs_dummy_vector(alu);
> +       alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP);
>         add_dst_reg(ctx, alu, &tmp_dst);
>         add_src_reg(ctx, alu, &inst->Src[0].Register);
>
> -       alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> +       alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>         add_dst_reg(ctx, alu, &tmp_dst);
>         add_src_reg(ctx, alu, &tmp_src);
>         add_src_reg(ctx, alu, &inst->Src[1].Register);
> @@ -725,8 +678,7 @@ translate_pow(struct fd2_compile_context *ctx,
>                 break;
>         }
>
> -       alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
> -       add_regs_dummy_vector(alu);
> +       alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
>         add_dst_reg(ctx, alu, &inst->Dst[0].Register);
>         add_src_reg(ctx, alu, &tmp_src);
>         add_scalar_clamp(inst, alu);
> @@ -737,7 +689,7 @@ translate_tex(struct fd2_compile_context *ctx,
>                 struct tgsi_full_instruction *inst, unsigned opc)
>  {
>         struct ir2_instruction *instr;
> -       struct ir2_register *reg;
> +       struct ir2_src_register *reg;
>         struct tgsi_dst_register tmp_dst;
>         struct tgsi_src_register tmp_src;
>         const struct tgsi_src_register *coord;
> @@ -766,19 +718,18 @@ translate_tex(struct fd2_compile_context *ctx,
>                  *
>                  *  dst = texture_sample(unit, coord, bias)
>                  */
> -               instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
>
> -               /* MAXv: */
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
>                 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
>                 add_src_reg(ctx, instr, &inst->Src[0].Register);
>                 add_src_reg(ctx, instr, &inst->Src[0].Register);
>
> -               /* RECIP_IEEE: */
> +               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
>                 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
>                 add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
>                                 swiz[inst->Src[0].Register.SwizzleW];
>
> -               instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>                 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
>                 add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
>                 add_src_reg(ctx, instr, &inst->Src[0].Register);
> @@ -788,7 +739,7 @@ translate_tex(struct fd2_compile_context *ctx,
>                 coord = &inst->Src[0].Register;
>         }
>
> -       instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
> +       instr = ir2_instr_create(ctx->so->ir, IR2_FETCH);
>         instr->fetch.opc = TEX_FETCH;
>         instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
>         instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT);
> @@ -807,7 +758,7 @@ translate_tex(struct fd2_compile_context *ctx,
>                 reg->swizzle[2] = reg->swizzle[0];
>
>         /* dst register needs to be marked for sync: */
> -       ctx->need_sync |= 1 << instr->regs[0]->num;
> +       ctx->need_sync |= 1 << instr->dst_reg.num;
>
>         /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
>         instr->sync = true;
> @@ -818,7 +769,7 @@ translate_tex(struct fd2_compile_context *ctx,
>                  * the texture to a temp and the use ALU instruction to move
>                  * to output
>                  */
> -               instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
>
>                 add_dst_reg(ctx, instr, &inst->Dst[0].Register);
>                 add_src_reg(ctx, instr, &tmp_src);
> @@ -869,22 +820,18 @@ translate_sge_slt_seq_sne(struct fd2_compile_context *ctx,
>
>         get_internal_temp(ctx, &tmp_dst, &tmp_src);
>
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
>         add_dst_reg(ctx, instr, &tmp_dst);
>         add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
>         add_src_reg(ctx, instr, &inst->Src[1].Register);
>
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), vopc, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, vopc);
>         add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> -       /* maybe should re-arrange the syntax some day, but
> -        * in assembler/disassembler and what ir.c expects
> -        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> -        */
> -       get_immediate(ctx, &tmp_const, fui(c0));
> -       add_src_reg(ctx, instr, &tmp_const);
>         add_src_reg(ctx, instr, &tmp_src);
>         get_immediate(ctx, &tmp_const, fui(c1));
>         add_src_reg(ctx, instr, &tmp_const);
> +       get_immediate(ctx, &tmp_const, fui(c0));
> +       add_src_reg(ctx, instr, &tmp_const);
>  }
>
>  /* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
> @@ -904,25 +851,25 @@ translate_lrp(struct fd2_compile_context *ctx,
>         get_immediate(ctx, &tmp_const, fui(1.0));
>
>         /* tmp1 = (a * b) */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>         add_dst_reg(ctx, instr, &tmp_dst1);
>         add_src_reg(ctx, instr, &inst->Src[0].Register);
>         add_src_reg(ctx, instr, &inst->Src[1].Register);
>
>         /* tmp2 = (1 - a) */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
>         add_dst_reg(ctx, instr, &tmp_dst2);
>         add_src_reg(ctx, instr, &tmp_const);
>         add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
>
>         /* tmp2 = tmp2 * c */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>         add_dst_reg(ctx, instr, &tmp_dst2);
>         add_src_reg(ctx, instr, &tmp_src2);
>         add_src_reg(ctx, instr, &inst->Src[2].Register);
>
>         /* dst = tmp1 + tmp2 */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
>         add_dst_reg(ctx, instr, &inst->Dst[0].Register);
>         add_src_reg(ctx, instr, &tmp_src1);
>         add_src_reg(ctx, instr, &tmp_src2);
> @@ -956,33 +903,28 @@ translate_trig(struct fd2_compile_context *ctx,
>         tmp_src.SwizzleX = tmp_src.SwizzleY =
>                         tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
>
> -       /* maybe should re-arrange the syntax some day, but
> -        * in assembler/disassembler and what ir.c expects
> -        * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
> -        */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
>         add_dst_reg(ctx, instr, &tmp_dst);
> -       get_immediate(ctx, &tmp_const, fui(0.5));
> -       add_src_reg(ctx, instr, &tmp_const);
>         add_src_reg(ctx, instr, &inst->Src[0].Register);
>         get_immediate(ctx, &tmp_const, fui(0.159155));
>         add_src_reg(ctx, instr, &tmp_const);
> +       get_immediate(ctx, &tmp_const, fui(0.5));
> +       add_src_reg(ctx, instr, &tmp_const);
>
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
>         add_dst_reg(ctx, instr, &tmp_dst);
>         add_src_reg(ctx, instr, &tmp_src);
>         add_src_reg(ctx, instr, &tmp_src);
>
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
>         add_dst_reg(ctx, instr, &tmp_dst);
> -       get_immediate(ctx, &tmp_const, fui(-3.141593));
> -       add_src_reg(ctx, instr, &tmp_const);
>         add_src_reg(ctx, instr, &tmp_src);
>         get_immediate(ctx, &tmp_const, fui(6.283185));
>         add_src_reg(ctx, instr, &tmp_const);
> +       get_immediate(ctx, &tmp_const, fui(-3.141593));
> +       add_src_reg(ctx, instr, &tmp_const);
>
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
> -       add_regs_dummy_vector(instr);
> +       instr = ir2_instr_create_alu_s(ctx->so->ir, op);
>         add_dst_reg(ctx, instr, &inst->Dst[0].Register);
>         add_src_reg(ctx, instr, &tmp_src);
>  }
> @@ -996,12 +938,12 @@ translate_dp2(struct fd2_compile_context *ctx,
>         struct ir2_instruction *instr;
>         /* DP2ADD c,a,b -> dot2(a,b) + c */
>         /* for c we use the constant 0.0 */
> -       instr = ir2_instr_create_alu(next_exec_cf(ctx), DOT2ADDv, ~0);
> -       get_immediate(ctx, &tmp_const, fui(0.0f));
> +       instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv);
>         add_dst_reg(ctx, instr, &inst->Dst[0].Register);
> -       add_src_reg(ctx, instr, &tmp_const);
>         add_src_reg(ctx, instr, &inst->Src[0].Register);
>         add_src_reg(ctx, instr, &inst->Src[1].Register);
> +       get_immediate(ctx, &tmp_const, fui(0.0f));
> +       add_src_reg(ctx, instr, &tmp_const);
>         add_vector_clamp(inst, instr);
>  }
>
> @@ -1015,80 +957,53 @@ translate_instruction(struct fd2_compile_context *ctx,
>  {
>         unsigned opc = inst->Instruction.Opcode;
>         struct ir2_instruction *instr;
> -       static struct ir2_cf *cf;
>
>         if (opc == TGSI_OPCODE_END)
>                 return;
>
> -       if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
> -               unsigned num = inst->Dst[0].Register.Index;
> -               /* seems like we need to ensure that position vs param/pixel
> -                * exports don't end up in the same EXEC clause..  easy way
> -                * to do this is force a new EXEC clause on first appearance
> -                * of an position or param/pixel export.
> -                */
> -               if ((num == ctx->position) || (num == ctx->psize)) {
> -                       if (ctx->num_position > 0) {
> -                               ctx->cf = NULL;
> -                               ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
> -                                               ctx->num_position - 1);
> -                               ctx->num_position = 0;
> -                       }
> -               } else {
> -                       if (ctx->num_param > 0) {
> -                               ctx->cf = NULL;
> -                               ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
> -                                               ctx->num_param - 1);
> -                               ctx->num_param = 0;
> -                       }
> -               }
> -       }
> -
> -       cf = next_exec_cf(ctx);
> -
>         /* TODO turn this into a table: */
>         switch (opc) {
>         case TGSI_OPCODE_MOV:
> -               instr = ir2_instr_create_alu(cf, MAXv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
>                 add_regs_vector_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_RCP:
> -               instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
> +               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
>                 add_regs_scalar_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_RSQ:
> -               instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
> +               instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE);
>                 add_regs_scalar_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_SQRT:
> -               instr = ir2_instr_create_alu(cf, ~0, SQRT_IEEE);
> +               instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE);
>                 add_regs_scalar_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_MUL:
> -               instr = ir2_instr_create_alu(cf, MULv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_ADD:
> -               instr = ir2_instr_create_alu(cf, ADDv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_DP2:
>                 translate_dp2(ctx, inst, opc);
>                 break;
>         case TGSI_OPCODE_DP3:
> -               instr = ir2_instr_create_alu(cf, DOT3v, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_DP4:
> -               instr = ir2_instr_create_alu(cf, DOT4v, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_MIN:
> -               instr = ir2_instr_create_alu(cf, MINv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MINv);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_MAX:
> -               instr = ir2_instr_create_alu(cf, MAXv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
>                 add_regs_vector_2(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_SLT:
> @@ -1098,22 +1013,22 @@ translate_instruction(struct fd2_compile_context *ctx,
>                 translate_sge_slt_seq_sne(ctx, inst, opc);
>                 break;
>         case TGSI_OPCODE_MAD:
> -               instr = ir2_instr_create_alu(cf, MULADDv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
>                 add_regs_vector_3(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_LRP:
>                 translate_lrp(ctx, inst, opc);
>                 break;
>         case TGSI_OPCODE_FRC:
> -               instr = ir2_instr_create_alu(cf, FRACv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
>                 add_regs_vector_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_FLR:
> -               instr = ir2_instr_create_alu(cf, FLOORv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv);
>                 add_regs_vector_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_EX2:
> -               instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
> +               instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
>                 add_regs_scalar_1(ctx, inst, instr);
>                 break;
>         case TGSI_OPCODE_POW:
> @@ -1128,10 +1043,9 @@ translate_instruction(struct fd2_compile_context *ctx,
>                 translate_tex(ctx, inst, opc);
>                 break;
>         case TGSI_OPCODE_CMP:
> -               instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv);
>                 add_regs_vector_3(ctx, inst, instr);
> -               // TODO this should be src0 if regs where in sane order..
> -               instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
> +               instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */
>                 break;
>         case TGSI_OPCODE_IF:
>                 push_predicate(ctx, &inst->Src[0].Register);
> @@ -1139,16 +1053,12 @@ translate_instruction(struct fd2_compile_context *ctx,
>                 break;
>         case TGSI_OPCODE_ELSE:
>                 ctx->so->ir->pred = IR2_PRED_NE;
> -               /* not sure if this is required in all cases, but blob compiler
> -                * won't combine EQ and NE in same CF:
> -                */
> -               ctx->cf = NULL;
>                 break;
>         case TGSI_OPCODE_ENDIF:
>                 pop_predicate(ctx);
>                 break;
>         case TGSI_OPCODE_F2I:
> -               instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
> +               instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv);
>                 add_regs_vector_1(ctx, inst, instr);
>                 break;
>         default:
> @@ -1179,8 +1089,6 @@ compile_instructions(struct fd2_compile_context *ctx)
>                         break;
>                 }
>         }
> -
> -       ctx->cf->cf_type = EXEC_END;
>  }
>
>  int
> diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> index 834a7c7fcd..34622eaba0 100644
> --- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> +++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
> @@ -199,7 +199,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
>                 instr->fetch.offset = elem->src_offset;
>
>                 for (j = 0; j < 4; j++)
> -                       instr->regs[0]->swizzle[j] = "xyzw01__"[desc->swizzle[j]];
> +                       instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]];
>
>                 assert(instr->fetch.fmt != ~0);
>
> @@ -210,7 +210,7 @@ patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
>                                 instr->fetch.const_idx,
>                                 instr->fetch.const_idx_sel,
>                                 elem->instance_divisor,
> -                               instr->regs[0]->swizzle,
> +                               instr->dst_reg.swizzle,
>                                 instr->fetch.stride,
>                                 instr->fetch.offset);
>         }
> @@ -307,7 +307,6 @@ static struct fd2_shader_stateobj *
>  create_blit_fp(void)
>  {
>         struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
> -       struct ir2_cf *cf;
>         struct ir2_instruction *instr;
>
>         if (!so)
> @@ -315,18 +314,13 @@ create_blit_fp(void)
>
>         so->ir = ir2_shader_create();
>
> -       cf = ir2_cf_create(so->ir, EXEC);
> -
> -       instr = ir2_instr_create_tex_fetch(cf, 0);
> -       ir2_reg_create(instr, 0, "xyzw", 0);
> -       ir2_reg_create(instr, 0, "xyx", 0);
> +       instr = ir2_instr_create_tex_fetch(so->ir, 0);
> +       ir2_dst_create(instr, 0, "xyzw", 0);
> +       ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT);
>         instr->sync = true;
>
> -       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> -       cf = ir2_cf_create(so->ir, EXEC_END);
> -
> -       instr = ir2_instr_create_alu(cf, MAXv, ~0);
> -       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> +       instr = ir2_instr_create_alu_v(so->ir, MAXv);
> +       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
>         ir2_reg_create(instr, 0, NULL, 0);
>         ir2_reg_create(instr, 0, NULL, 0);
>
> @@ -349,7 +343,6 @@ static struct fd2_shader_stateobj *
>  create_blit_vp(void)
>  {
>         struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
> -       struct ir2_cf *cf;
>         struct ir2_instruction *instr;
>
>         if (!so)
> @@ -357,31 +350,23 @@ create_blit_vp(void)
>
>         so->ir = ir2_shader_create();
>
> -       cf = ir2_cf_create(so->ir, EXEC);
> -
> -       instr = ir2_instr_create_vtx_fetch(cf, 26, 1, FMT_32_32_FLOAT, false, 8);
> +       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8);
>         instr->fetch.is_normalized = true;
> -       ir2_reg_create(instr, 1, "xy01", 0);
> -       ir2_reg_create(instr, 0, "x", 0);
> +       ir2_dst_create(instr, 1, "xy01", 0);
> +       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> -       instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> +       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
>         instr->fetch.is_normalized = true;
> -       ir2_reg_create(instr, 2, "xyz1", 0);
> -       ir2_reg_create(instr, 0, "x", 0);
> -
> -       cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
> -       cf = ir2_cf_create(so->ir, EXEC);
> +       ir2_dst_create(instr, 2, "xyz1", 0);
> +       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> -       instr = ir2_instr_create_alu(cf, MAXv, ~0);
> -       ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
> +       instr = ir2_instr_create_alu_v(so->ir, MAXv);
> +       ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
>         ir2_reg_create(instr, 2, NULL, 0);
>         ir2_reg_create(instr, 2, NULL, 0);
>
> -       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> -       cf = ir2_cf_create(so->ir, EXEC_END);
> -
> -       instr = ir2_instr_create_alu(cf, MAXv, ~0);
> -       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> +       instr = ir2_instr_create_alu_v(so->ir, MAXv);
> +       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
>         ir2_reg_create(instr, 1, NULL, 0);
>         ir2_reg_create(instr, 1, NULL, 0);
>
> @@ -397,7 +382,6 @@ static struct fd2_shader_stateobj *
>  create_solid_fp(void)
>  {
>         struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
> -       struct ir2_cf *cf;
>         struct ir2_instruction *instr;
>
>         if (!so)
> @@ -405,11 +389,8 @@ create_solid_fp(void)
>
>         so->ir = ir2_shader_create();
>
> -       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> -       cf = ir2_cf_create(so->ir, EXEC_END);
> -
> -       instr = ir2_instr_create_alu(cf, MAXv, ~0);
> -       ir2_reg_create(instr, 0, NULL, IR2_REG_EXPORT);
> +       instr = ir2_instr_create_alu_v(so->ir, MAXv);
> +       ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
>         ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
>         ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
>
> @@ -430,7 +411,6 @@ static struct fd2_shader_stateobj *
>  create_solid_vp(void)
>  {
>         struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
> -       struct ir2_cf *cf;
>         struct ir2_instruction *instr;
>
>         if (!so)
> @@ -438,22 +418,15 @@ create_solid_vp(void)
>
>         so->ir = ir2_shader_create();
>
> -       cf = ir2_cf_create(so->ir, EXEC);
> -
> -       instr = ir2_instr_create_vtx_fetch(cf, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> -       ir2_reg_create(instr, 1, "xyz1", 0);
> -       ir2_reg_create(instr, 0, "x", 0);
> -
> -       cf = ir2_cf_create_alloc(so->ir, SQ_POSITION, 0);
> -       cf = ir2_cf_create(so->ir, EXEC);
> +       instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
> +       ir2_dst_create(instr, 1, "xyz1", 0);
> +       ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
>
> -       instr = ir2_instr_create_alu(cf, MAXv, ~0);
> -       ir2_reg_create(instr, 62, NULL, IR2_REG_EXPORT);
> +       instr = ir2_instr_create_alu_v(so->ir, MAXv);
> +       ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
>         ir2_reg_create(instr, 1, NULL, 0);
>         ir2_reg_create(instr, 1, NULL, 0);
>
> -       cf = ir2_cf_create_alloc(so->ir, SQ_PARAMETER_PIXEL, 0);
> -       cf = ir2_cf_create(so->ir, EXEC_END);
>
>         return assemble(so);
>  }
> diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> index ac972ed35a..5a9f93ec79 100644
> --- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> +++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
> @@ -147,15 +147,25 @@ typedef struct PACKED {
>         uint8_t             const_1_rel_abs          : 1;
>         uint8_t             const_0_rel_abs          : 1;
>         /* dword2: */
> -       uint8_t             src3_reg                 : 6;
> -       uint8_t             src3_reg_select          : 1;
> -       uint8_t             src3_reg_abs             : 1;
> -       uint8_t             src2_reg                 : 6;
> -       uint8_t             src2_reg_select          : 1;
> -       uint8_t             src2_reg_abs             : 1;
> -       uint8_t             src1_reg                 : 6;
> -       uint8_t             src1_reg_select          : 1;
> -       uint8_t             src1_reg_abs             : 1;
> +       union {
> +               struct {
> +                       uint8_t             src3_reg         : 6;
> +                       uint8_t             src3_reg_select  : 1;
> +                       uint8_t             src3_reg_abs     : 1;
> +                       uint8_t             src2_reg         : 6;
> +                       uint8_t             src2_reg_select  : 1;
> +                       uint8_t             src2_reg_abs     : 1;
> +                       uint8_t             src1_reg         : 6;
> +                       uint8_t             src1_reg_select  : 1;
> +                       uint8_t             src1_reg_abs     : 1;
> +               };
> +               /* constants have full 8-bit index */
> +               struct {
> +                       uint8_t             src3_reg_const   : 8;
> +                       uint8_t             src2_reg_const   : 8;
> +                       uint8_t             src1_reg_const   : 8;
> +               };
> +       };
>         instr_vector_opc_t  vector_opc               : 5;
>         uint8_t             src3_sel                 : 1;
>         uint8_t             src2_sel                 : 1;
> diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> index 42a9ab494e..af9811864f 100644
> --- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> +++ b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
> @@ -35,19 +35,13 @@
>  #define WARN_MSG(f, ...)   DBG("WARN:  "f, ##__VA_ARGS__)
>  #define ERROR_MSG(f, ...)  DBG("ERROR: "f, ##__VA_ARGS__)
>
> -#define REG_MASK 0x3f
> -
> -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr);
> -
>  static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
>                 uint32_t idx, struct ir2_shader_info *info);
>
> -static void reg_update_stats(struct ir2_register *reg,
> -               struct ir2_shader_info *info, bool dest);
> -static uint32_t reg_fetch_src_swiz(struct ir2_register *reg, uint32_t n);
> -static uint32_t reg_fetch_dst_swiz(struct ir2_register *reg);
> -static uint32_t reg_alu_dst_swiz(struct ir2_register *reg);
> -static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
> +static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n);
> +static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg);
> +static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg);
> +static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg);
>
>  /* simple allocator to carve allocations out of an up-front allocated heap,
>   * so that we can free everything easily in one shot.
> @@ -55,7 +49,7 @@ static uint32_t reg_alu_src_swiz(struct ir2_register *reg);
>  static void * ir2_alloc(struct ir2_shader *shader, int sz)
>  {
>         void *ptr = &shader->heap[shader->heap_idx];
> -       shader->heap_idx += align(sz, 4);
> +       shader->heap_idx += align(sz, 4) / 4;
>         return ptr;
>  }
>
> @@ -74,7 +68,9 @@ static char * ir2_strdup(struct ir2_shader *shader, const char *str)
>  struct ir2_shader * ir2_shader_create(void)
>  {
>         DEBUG_MSG("");
> -       return calloc(1, sizeof(struct ir2_shader));
> +       struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader));
> +       shader->max_reg = -1;
> +       return shader;
>  }
>
>  void ir2_shader_destroy(struct ir2_shader *shader)
> @@ -83,189 +79,344 @@ void ir2_shader_destroy(struct ir2_shader *shader)
>         free(shader);
>  }
>
> -/* resolve addr/cnt/sequence fields in the individual CF's */
> -static int shader_resolve(struct ir2_shader *shader, struct ir2_shader_info *info)
> +/* check if an instruction is a simple MOV
> + */
> +static struct ir2_instruction * simple_mov(struct ir2_instruction *instr,
> +               bool output)
>  {
> -       uint32_t addr;
> -       unsigned i;
> -       int j;
> -
> -       addr = shader->cfs_count / 2;
> -       for (i = 0; i < shader->cfs_count; i++) {
> -               struct ir2_cf *cf = shader->cfs[i];
> -               if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
> -                       uint32_t sequence = 0;
> -
> -                       if (cf->exec.addr && (cf->exec.addr != addr))
> -                               WARN_MSG("invalid addr '%d' at CF %d", cf->exec.addr, i);
> -                       if (cf->exec.cnt && (cf->exec.cnt != cf->exec.instrs_count))
> -                               WARN_MSG("invalid cnt '%d' at CF %d", cf->exec.cnt, i);
> -
> -                       for (j = cf->exec.instrs_count - 1; j >= 0; j--) {
> -                               struct ir2_instruction *instr = cf->exec.instrs[j];
> -                               sequence <<= 2;
> -                               if (instr->instr_type == IR2_FETCH)
> -                                       sequence |= 0x1;
> -                               if (instr->sync)
> -                                       sequence |= 0x2;
> -                       }
> +    struct ir2_src_register *src_reg = instr->src_reg;
> +    struct ir2_dst_register *dst_reg = &instr->dst_reg;
> +    struct ir2_register *reg;
> +    unsigned i;
> +
> +    /* MAXv used for MOV */
> +    if (instr->instr_type != IR2_ALU_VECTOR ||
> +               instr->alu_vector.opc != MAXv)
> +               return NULL;
> +
> +       /* non identical srcs */
> +       if (src_reg[0].num != src_reg[1].num)
> +               return NULL;
> +
> +       /* flags */
> +       int flags = IR2_REG_NEGATE | IR2_REG_ABS;
> +       if (output)
> +               flags |= IR2_REG_INPUT | IR2_REG_CONST;
> +       if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags))
> +               return NULL;
> +
> +       /* clamping */
> +       if (instr->alu_vector.clamp)
> +               return NULL;
> +
> +       /* swizzling */
> +    for (i = 0; i < 4; i++) {
> +               char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i];
> +               if (swiz == '_')
> +                       continue;
> +
> +               if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] ||
> +                       swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i])
> +                       return NULL;
> +    }
> +
> +    if (output)
> +               reg = &instr->shader->reg[src_reg[0].num];
> +       else
> +               reg = &instr->shader->reg[dst_reg->num];
> +
> +       assert(reg->write_idx >= 0);
> +    if (reg->write_idx != reg->write_idx2)
> +               return NULL;
> +
> +       if (!output)
> +               return instr;
> +
> +       instr = instr->shader->instr[reg->write_idx];
> +       return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr;
> +}
>
> -                       cf->exec.addr = addr;
> -                       cf->exec.cnt  = cf->exec.instrs_count;
> -                       cf->exec.sequence = sequence;
> +static int src_to_reg(struct ir2_instruction *instr,
> +               struct ir2_src_register *reg)
> +{
> +       if (reg->flags & IR2_REG_CONST)
> +               return reg->num;
>
> -                       addr += cf->exec.instrs_count;
> -               }
> -       }
> +       return instr->shader->reg[reg->num].reg;
> +}
> +
> +static int dst_to_reg(struct ir2_instruction *instr,
> +               struct ir2_dst_register *reg)
> +{
> +       if (reg->flags & IR2_REG_EXPORT)
> +               return reg->num;
>
> -       info->sizedwords = 3 * addr;
> +       return instr->shader->reg[reg->num].reg;
> +}
>
> -       return 0;
> +static bool mask_get(uint32_t *mask, unsigned index)
> +{
> +    return !!(mask[index / 32] & 1 << index % 32);
>  }
>
> -void * ir2_shader_assemble(struct ir2_shader *shader, struct ir2_shader_info *info)
> +static void mask_set(uint32_t *mask, struct ir2_register *reg, int index)
>  {
> -       uint32_t i, j;
> -       uint32_t *ptr, *dwords = NULL;
> -       uint32_t idx = 0;
> -       int ret;
> -
> -       info->sizedwords    = 0;
> -       info->max_reg       = -1;
> -       info->max_input_reg = 0;
> -       info->regs_written  = 0;
> -
> -       /* we need an even # of CF's.. insert a NOP if needed */
> -       if (shader->cfs_count != align(shader->cfs_count, 2))
> -               ir2_cf_create(shader, NOP);
> -
> -       /* first pass, resolve sizes and addresses: */
> -       ret = shader_resolve(shader, info);
> -       if (ret) {
> -               ERROR_MSG("resolve failed: %d", ret);
> -               goto fail;
> +       if (reg) {
> +               unsigned i;
> +               for (i = 0; i < ARRAY_SIZE(reg->regmask); i++)
> +                       mask[i] |= reg->regmask[i];
>         }
> +       if (index >= 0)
> +               mask[index / 32] |= 1 << index % 32;
> +}
>
> -       ptr = dwords = calloc(4, info->sizedwords);
> +static bool sets_pred(struct ir2_instruction *instr)
> +{
> +    return instr->instr_type == IR2_ALU_SCALAR &&
> +               instr->alu_scalar.opc >= PRED_SETEs &&
> +               instr->alu_scalar.opc <= PRED_SET_RESTOREs;
> +}
>
> -       /* second pass, emit CF program in pairs: */
> -       for (i = 0; i < shader->cfs_count; i += 2) {
> -               instr_cf_t *cfs = (instr_cf_t *)ptr;
> -               ret = cf_emit(shader->cfs[i], &cfs[0]);
> -               if (ret) {
> -                       ERROR_MSG("CF emit failed: %d\n", ret);
> -                       goto fail;
> +
> +
> +void* ir2_shader_assemble(struct ir2_shader *shader,
> +               struct ir2_shader_info *info)
> +{
> +       /* NOTES
> +        * blob compiler seems to always put PRED_* instrs in a CF by
> +        * themselves, and won't combine EQ/NE in the same CF
> +        * (not doing this - doesn't seem to make a difference)
> +        *
> +        * TODO: implement scheduling for combining vector+scalar instructions
> +        * -some vector instructions can be replaced by scalar
> +        */
> +
> +       /* first step:
> +        * 1. remove "NOP" MOV instructions generated by TGSI for input/output:
> +        * 2. track information for register allocation, and to remove
> +        * the dead code when some exports are not needed
> +        * 3. add additional instructions for a20x hw binning if needed
> +        * NOTE: modifies the shader instrs
> +        * this step could be done as instructions are added by compiler instead
> +        */
> +
> +       /* mask of exports that must be generated
> +        * used to avoid calculating ps exports with hw binning
> +       */
> +       uint64_t export = ~0ull;
> +       /* bitmask of variables required for exports defined by "export" */
> +       uint32_t export_mask[REG_MASK/32+1] = {};
> +
> +       unsigned idx, reg_idx;
> +       unsigned max_input = 0;
> +       int export_size = -1;
> +
> +       for (idx = 0; idx < shader->instr_count; idx++) {
> +               struct ir2_instruction *instr = shader->instr[idx], *prev;
> +               struct ir2_dst_register dst_reg = instr->dst_reg;
> +
> +               if (dst_reg.flags & IR2_REG_EXPORT) {
> +                       if (dst_reg.num < 32)
> +                               export_size++;
> +
> +                       if ((prev = simple_mov(instr, true))) {
> +                               /* copy instruction but keep dst */
> +                               *instr = *prev;
> +                               instr->dst_reg = dst_reg;
> +                       }
>                 }
> -               ret = cf_emit(shader->cfs[i+1], &cfs[1]);
> -               if (ret) {
> -                       ERROR_MSG("CF emit failed: %d\n", ret);
> -                       goto fail;
> +
> +               for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) {
> +                       struct ir2_src_register *src_reg = &instr->src_reg[reg_idx];
> +                       struct ir2_register *reg;
> +                       int num;
> +
> +                       if (src_reg->flags & IR2_REG_CONST)
> +                               continue;
> +
> +                       num = src_reg->num;
> +                       reg = &shader->reg[num];
> +                       reg->read_idx = idx;
> +
> +                       if (src_reg->flags & IR2_REG_INPUT) {
> +                               max_input = MAX2(max_input, num);
> +                       } else {
> +                               /* bypass simple mov used to set src_reg */
> +                               assert(reg->write_idx >= 0);
> +                               prev = shader->instr[reg->write_idx];
> +                               if (simple_mov(prev, false)) {
> +                                       *src_reg = prev->src_reg[0];
> +                                       /* process same src_reg again */
> +                                       reg_idx -= 1;
> +                                       continue;
> +                               }
> +                       }
> +
> +                       /* update dependencies */
> +                       uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
> +                                       export_mask : shader->reg[dst_reg.num].regmask;
> +                       mask_set(mask, reg, num);
> +                       if (sets_pred(instr))
> +                               mask_set(export_mask, reg, num);
>                 }
> -               ptr += 3;
> -               assert((ptr - dwords) <= info->sizedwords);
>         }
>
> -       /* third pass, emit ALU/FETCH: */
> -       for (i = 0; i < shader->cfs_count; i++) {
> -               struct ir2_cf *cf = shader->cfs[i];
> -               if ((cf->cf_type == EXEC) || (cf->cf_type == EXEC_END)) {
> -                       for (j = 0; j < cf->exec.instrs_count; j++) {
> -                               ret = instr_emit(cf->exec.instrs[j], ptr, idx++, info);
> -                               if (ret) {
> -                                       ERROR_MSG("instruction emit failed: %d", ret);
> -                                       goto fail;
> -                               }
> -                               ptr += 3;
> -                               assert((ptr - dwords) <= info->sizedwords);
> +       /* second step:
> +        * emit instructions (with CFs) + RA
> +        */
> +       instr_cf_t cfs[128], *cf = cfs;
> +       uint32_t alufetch[3*256], *af = alufetch;
> +
> +       /* RA is done on write, so inputs must be allocated here */
> +       for (reg_idx = 0; reg_idx <= max_input; reg_idx++)
> +               shader->reg[reg_idx].reg = reg_idx;
> +       info->max_reg = max_input;
> +
> +       /* CF instr state */
> +       instr_cf_exec_t exec = { .opc = EXEC };
> +       instr_cf_alloc_t alloc = { .opc = ALLOC };
> +       bool need_alloc = 0;
> +       bool pos_export = 0;
> +
> +       export_size = MAX2(export_size, 0);
> +
> +       for (idx = 0; idx < shader->instr_count; idx++) {
> +               struct ir2_instruction *instr = shader->instr[idx];
> +               struct ir2_dst_register *dst_reg = &instr->dst_reg;
> +               unsigned num = dst_reg->num;
> +               struct ir2_register *reg;
> +
> +               /* a2xx only has 64 registers, so we can use a single 64-bit mask */
> +               uint64_t regmask = 0ull;
> +
> +               /* compute the current regmask */
> +               for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) {
> +                       reg = &shader->reg[reg_idx];
> +                       if ((int) idx > reg->write_idx && idx < reg->read_idx)
> +                               regmask |= (1ull << reg->reg);
> +               }
> +
> +               if (dst_reg->flags & IR2_REG_EXPORT) {
> +                       /* skip if export is not needed */
> +                       if (!(export & (1ull << num)))
> +                               continue;
> +
> +            /* ALLOC CF:
> +             * want to alloc all < 32 at once
> +                        * 32/33 and 62/63 come in pairs
> +                        * XXX assuming all 3 types are never interleaved
> +                        */
> +            if (num < 32) {
> +                               alloc.size = export_size;
> +                               alloc.buffer_select = SQ_PARAMETER_PIXEL;
> +                               need_alloc = export_size >= 0;
> +                               export_size = -1;
> +                       } else if (num == 32 || num == 33) {
> +                               alloc.size = 0;
> +                               alloc.buffer_select = SQ_MEMORY;
> +                               need_alloc = num != 33;
> +                       } else {
> +                               alloc.size = 0;
> +                               alloc.buffer_select = SQ_POSITION;
> +                               need_alloc = !pos_export;
> +                               pos_export = true;
>                         }
> +
> +               } else {
> +                       /* skip if dst register not needed to compute exports */
> +                       if (!mask_get(export_mask, num))
> +                               continue;
> +
> +                       /* RA on first write */
> +                       reg = &shader->reg[num];
> +                       if (reg->write_idx == idx) {
> +                               reg->reg = ffsll(~regmask) - 1;
> +                               info->max_reg = MAX2(info->max_reg, reg->reg);
> +                       }
> +               }
> +
> +               if (exec.count == 6 || (exec.count && need_alloc)) {
> +                       *cf++ = *(instr_cf_t*) &exec;
> +                       exec.address += exec.count;
> +                       exec.serialize = 0;
> +                       exec.count = 0;
>                 }
> +
> +               if (need_alloc) {
> +                       *cf++ = *(instr_cf_t*) &alloc;
> +                       need_alloc = false;
> +               }
> +
> +               int ret = instr_emit(instr, af, idx, info); af += 3;
> +               assert(!ret);
> +
> +               if (instr->instr_type == IR2_FETCH)
> +                       exec.serialize |= 0x1 << exec.count * 2;
> +               if (instr->sync)
> +                       exec.serialize |= 0x2 << exec.count * 2;
> +                exec.count += 1;
>         }
>
> -       return dwords;
>
> -fail:
> -       free(dwords);
> -       return NULL;
> -}
> +       exec.opc = !export_size ? EXEC : EXEC_END;
> +       *cf++ = *(instr_cf_t*) &exec;
> +       exec.address += exec.count;
> +       exec.serialize = 0;
> +       exec.count = 0;
>
> +       /* GPU will hang without at least one pixel alloc */
> +       if (!export_size) {
> +               alloc.size = 0;
> +               alloc.buffer_select = SQ_PARAMETER_PIXEL;
> +               *cf++ = *(instr_cf_t*) &alloc;
>
> -struct ir2_cf * ir2_cf_create(struct ir2_shader *shader, instr_cf_opc_t cf_type)
> -{
> -       struct ir2_cf *cf = ir2_alloc(shader, sizeof(struct ir2_cf));
> -       DEBUG_MSG("%d", cf_type);
> -       cf->shader = shader;
> -       cf->cf_type = cf_type;
> -       assert(shader->cfs_count < ARRAY_SIZE(shader->cfs));
> -       shader->cfs[shader->cfs_count++] = cf;
> -       return cf;
> -}
> +               exec.opc = EXEC_END;
> +               *cf++ = *(instr_cf_t*) &exec;
> +       }
>
> +       unsigned num_cfs = cf - cfs;
>
> -/*
> - * CF instructions:
> - */
> +       /* insert nop to get an even # of CFs */
> +       if (num_cfs % 2) {
> +               *cf++ = (instr_cf_t) { .opc = NOP };
> +               num_cfs++;
> +       }
>
> -static int cf_emit(struct ir2_cf *cf, instr_cf_t *instr)
> -{
> -       memset(instr, 0, sizeof(*instr));
> -
> -       instr->opc = cf->cf_type;
> -
> -       switch (cf->cf_type) {
> -       case NOP:
> -               break;
> -       case EXEC:
> -       case EXEC_END:
> -               assert(cf->exec.addr <= 0x1ff);
> -               assert(cf->exec.cnt <= 0x6);
> -               assert(cf->exec.sequence <= 0xfff);
> -               instr->exec.address = cf->exec.addr;
> -               instr->exec.count = cf->exec.cnt;
> -               instr->exec.serialize = cf->exec.sequence;
> -               break;
> -       case ALLOC:
> -               assert(cf->alloc.size <= 0xf);
> -               instr->alloc.size = cf->alloc.size;
> -               switch (cf->alloc.type) {
> -               case SQ_POSITION:
> -               case SQ_PARAMETER_PIXEL:
> -                       instr->alloc.buffer_select = cf->alloc.type;
> +       /* offset cf addrs */
> +       for (idx = 0; idx < num_cfs; idx++) {
> +        switch (cfs[idx].opc) {
> +               case EXEC:
> +               case EXEC_END:
> +                       cfs[idx].exec.address += num_cfs / 2;
>                         break;
>                 default:
> -                       ERROR_MSG("invalid alloc type: %d", cf->alloc.type);
> -                       return -1;
> +                       break;
> +               /* XXX  and any other address using cf that gets implemented */
>                 }
> -               break;
> -       case COND_EXEC:
> -       case COND_EXEC_END:
> -       case COND_PRED_EXEC:
> -       case COND_PRED_EXEC_END:
> -       case LOOP_START:
> -       case LOOP_END:
> -       case COND_CALL:
> -       case RETURN:
> -       case COND_JMP:
> -       case COND_EXEC_PRED_CLEAN:
> -       case COND_EXEC_PRED_CLEAN_END:
> -       case MARK_VS_FETCH_DONE:
> -               ERROR_MSG("TODO");
> -               return -1;
>         }
>
> -       return 0;
> +       /* concatenate cfs+alufetchs */
> +       uint32_t cfdwords = num_cfs / 2 * 3;
> +       uint32_t alufetchdwords = exec.address * 3;
> +       info->sizedwords = cfdwords + alufetchdwords;
> +       uint32_t *dwords = malloc(info->sizedwords * 4);
> +       assert(dwords);
> +       memcpy(dwords, cfs, cfdwords * 4);
> +       memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4);
> +       return dwords;
>  }
>
> -
> -struct ir2_instruction * ir2_instr_create(struct ir2_cf *cf, int instr_type)
> +struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
> +               int instr_type)
>  {
>         struct ir2_instruction *instr =
> -                       ir2_alloc(cf->shader, sizeof(struct ir2_instruction));
> +                       ir2_alloc(shader, sizeof(struct ir2_instruction));
>         DEBUG_MSG("%d", instr_type);
> -       instr->shader = cf->shader;
> -       instr->pred = cf->shader->pred;
> +       instr->shader = shader;
> +       instr->idx = shader->instr_count;
> +       instr->pred = shader->pred;
>         instr->instr_type = instr_type;
> -       assert(cf->exec.instrs_count < ARRAY_SIZE(cf->exec.instrs));
> -       cf->exec.instrs[cf->exec.instrs_count++] = instr;
> +       shader->instr[shader->instr_count++] = instr;
>         return instr;
>  }
>
> @@ -279,15 +430,11 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
>                 struct ir2_shader_info *info)
>  {
>         instr_fetch_t *fetch = (instr_fetch_t *)dwords;
> -       int reg = 0;
> -       struct ir2_register *dst_reg = instr->regs[reg++];
> -       struct ir2_register *src_reg = instr->regs[reg++];
> +       struct ir2_dst_register *dst_reg = &instr->dst_reg;
> +       struct ir2_src_register *src_reg = &instr->src_reg[0];
>
>         memset(fetch, 0, sizeof(*fetch));
>
> -       reg_update_stats(dst_reg, info, true);
> -       reg_update_stats(src_reg, info, false);
> -
>         fetch->opc = instr->fetch.opc;
>
>         if (instr->fetch.opc == VTX_FETCH) {
> @@ -298,9 +445,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
>                 assert(instr->fetch.const_idx <= 0x1f);
>                 assert(instr->fetch.const_idx_sel <= 0x3);
>
> -               vtx->src_reg = src_reg->num;
> +               vtx->src_reg = src_to_reg(instr, src_reg);
>                 vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1);
> -               vtx->dst_reg = dst_reg->num;
> +               vtx->dst_reg = dst_to_reg(instr, dst_reg);
>                 vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg);
>                 vtx->must_be_one = 1;
>                 vtx->const_index = instr->fetch.const_idx;
> @@ -326,9 +473,9 @@ static int instr_emit_fetch(struct ir2_instruction *instr,
>
>