[Mesa-dev] [PATCH 9/9] freedreno: a2xx: NIR backend

Tue Nov 13 20:04:11 UTC 2018

This patch adds several new features:
-gl_FrontFacing, gl_FragCoord, gl_PointCoord, gl_PointSize
-control flow (loops), but there are issues with nested predicates
 (nested predicates were completely broken before)
-texture related features (LOD/bias, cubemaps)

And generally much better shaders because of NIR optimizations and using
the scalar slot for both the special scalar-only ops and regular vec1 ops.

It needs some nir fixes (1-bit bools patches+integer fixes in glsl_to_nir),
but it works without them.

A different instruction order can lead to shorter shaders because of different
opportunities to insert instructions in the scalar slot. The instruction
scheduling has some dumb heuristics for that.

I started with a simple register allocator and then tried to make it work
with control flow, so it might be broken in some edge cases. Register
allocation might need a rework at some point.

Signed-off-by: Jonathan Marek <jonathan at marek.ca>
---
 .../drivers/freedreno/Makefile.sources        |   12 +-
 .../drivers/freedreno/a2xx/fd2_compiler.c     | 1119 ----------------
 .../drivers/freedreno/a2xx/fd2_compiler.h     |   36 -
 src/gallium/drivers/freedreno/a2xx/fd2_draw.c |    4 +-
 src/gallium/drivers/freedreno/a2xx/fd2_emit.c |   16 +-
 src/gallium/drivers/freedreno/a2xx/fd2_gmem.c |   15 +-
 .../drivers/freedreno/a2xx/fd2_program.c      |  496 +++----
 .../drivers/freedreno/a2xx/fd2_program.h      |   47 +-
 .../drivers/freedreno/a2xx/instr-a2xx.h       |   19 +-
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.c  |  809 -----------
 src/gallium/drivers/freedreno/a2xx/ir-a2xx.h  |  188 ---
 src/gallium/drivers/freedreno/a2xx/ir2.c      |  422 ++++++
 src/gallium/drivers/freedreno/a2xx/ir2.h      |   78 ++
 .../drivers/freedreno/a2xx/ir2_assemble.c     |  546 ++++++++
 src/gallium/drivers/freedreno/a2xx/ir2_nir.c  | 1184 +++++++++++++++++
 .../freedreno/a2xx/ir2_nir_lower_scalar.c     |  174 +++
 .../drivers/freedreno/a2xx/ir2_private.h      |  393 ++++++
 src/gallium/drivers/freedreno/a2xx/ir2_ra.c   |  226 ++++
 .../freedreno/a2xx/ir2_substitutions.c        |  226 ++++
 .../drivers/freedreno/freedreno_context.h     |    8 -
 .../drivers/freedreno/freedreno_program.c     |    9 +-
 .../drivers/freedreno/freedreno_screen.c      |   21 +-
 src/gallium/drivers/freedreno/meson.build     |   12 +-
 23 files changed, 3541 insertions(+), 2519 deletions(-)
 delete mode 100644 src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
 delete mode 100644 src/gallium/drivers/freedreno/a2xx/fd2_compiler.h
 delete mode 100644 src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
 delete mode 100644 src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2.c
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2.h
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_assemble.c
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_nir.c
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_nir_lower_scalar.c
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_private.h
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_ra.c
 create mode 100644 src/gallium/drivers/freedreno/a2xx/ir2_substitutions.c

diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index 4d4644f96b..7f2b8e7b7d 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -62,8 +62,6 @@ a2xx_SOURCES := \
 	a2xx/disasm-a2xx.c \
 	a2xx/fd2_blend.c \
 	a2xx/fd2_blend.h \
-	a2xx/fd2_compiler.c \
-	a2xx/fd2_compiler.h \
 	a2xx/fd2_context.c \
 	a2xx/fd2_context.h \
 	a2xx/fd2_draw.c \
@@ -87,8 +85,14 @@ a2xx_SOURCES := \
 	a2xx/fd2_zsa.c \
 	a2xx/fd2_zsa.h \
 	a2xx/instr-a2xx.h \
-	a2xx/ir-a2xx.c \
-	a2xx/ir-a2xx.h
+	a2xx/ir2.c \
+	a2xx/ir2_nir.c \
+	a2xx/ir2_nir_lower_scalar.c \
+	a2xx/ir2_substitutions.c \
+	a2xx/ir2_ra.c \
+	a2xx/ir2_assemble.c \
+	a2xx/ir2_private.h \
+	a2xx/ir2.h
 
 a3xx_SOURCES := \
 	a3xx/a3xx.xml.h \
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
deleted file mode 100644
index 156bfc247c..0000000000
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ /dev/null
@@ -1,1119 +0,0 @@
-/*
- * Copyright (C) 2012 Rob Clark <robclark at freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark at freedesktop.org>
- */
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "fd2_compiler.h"
-#include "fd2_program.h"
-#include "fd2_util.h"
-
-#include "instr-a2xx.h"
-#include "ir-a2xx.h"
-
-struct fd2_compile_context {
-	struct fd_program_stateobj *prog;
-	struct fd2_shader_stateobj *so;
-
-	struct tgsi_parse_context parser;
-	unsigned type;
-
-	/* predicate stack: */
-	int pred_depth;
-	enum ir2_pred pred_stack[8];
-
-	/* Internal-Temporary and Predicate register assignment:
-	 *
-	 * Some TGSI instructions which translate into multiple actual
-	 * instructions need one or more temporary registers, which are not
-	 * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
-	 * And some instructions (texture fetch) cannot write directly to
-	 * output registers.  We could be more clever and re-use dst or a
-	 * src register in some cases.  But for now don't try to be clever.
-	 * Eventually we should implement an optimization pass that re-
-	 * juggles the register usage and gets rid of unneeded temporaries.
-	 *
-	 * The predicate register must be valid across multiple TGSI
-	 * instructions, but internal temporary's do not.  For this reason,
-	 * once the predicate register is requested, until it is no longer
-	 * needed, it gets the first register slot after after the TGSI
-	 * assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]), and the
-	 * internal temporaries get the register slots above this.
-	 */
-
-	int pred_reg;
-	int num_internal_temps;
-
-	uint8_t num_regs[TGSI_FILE_COUNT];
-
-	/* maps input register idx to prog->export_linkage idx: */
-	uint8_t input_export_idx[64];
-
-	/* maps output register idx to prog->export_linkage idx: */
-	uint8_t output_export_idx[64];
-
-	/* idx/slot for last compiler generated immediate */
-	unsigned immediate_idx;
-
-	// TODO we can skip emit exports in the VS that the FS doesn't need..
-	// and get rid perhaps of num_param..
-	unsigned num_position, num_param;
-	unsigned position, psize;
-
-	uint64_t need_sync;
-};
-
-static int
-semantic_idx(struct tgsi_declaration_semantic *semantic)
-{
-	int idx = semantic->Name;
-	if (idx == TGSI_SEMANTIC_GENERIC)
-		idx = TGSI_SEMANTIC_COUNT + semantic->Index;
-	return idx;
-}
-
-/* assign/get the input/export register # for given semantic idx as
- * returned by semantic_idx():
- */
-static int
-export_linkage(struct fd2_compile_context *ctx, int idx)
-{
-	struct fd_program_stateobj *prog = ctx->prog;
-
-	/* if first time we've seen this export, assign the next available slot: */
-	if (prog->export_linkage[idx] == 0xff)
-		prog->export_linkage[idx] = prog->num_exports++;
-
-	return prog->export_linkage[idx];
-}
-
-static unsigned
-compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
-		struct fd2_shader_stateobj *so)
-{
-	unsigned ret;
-
-	ctx->prog = prog;
-	ctx->so = so;
-	ctx->pred_depth = 0;
-
-	ret = tgsi_parse_init(&ctx->parser, so->tokens);
-	if (ret != TGSI_PARSE_OK)
-		return ret;
-
-	ctx->type = ctx->parser.FullHeader.Processor.Processor;
-	ctx->position = ~0;
-	ctx->psize = ~0;
-	ctx->num_position = 0;
-	ctx->num_param = 0;
-	ctx->need_sync = 0;
-	ctx->immediate_idx = 0;
-	ctx->pred_reg = -1;
-	ctx->num_internal_temps = 0;
-
-	memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
-	memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
-	memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));
-
-	/* do first pass to extract declarations: */
-	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-		tgsi_parse_token(&ctx->parser);
-
-		switch (ctx->parser.FullToken.Token.Type) {
-		case TGSI_TOKEN_TYPE_DECLARATION: {
-			struct tgsi_full_declaration *decl =
-					&ctx->parser.FullToken.FullDeclaration;
-			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
-				unsigned name = decl->Semantic.Name;
-
-				assert(decl->Declaration.Semantic);  // TODO is this ever not true?
-
-				ctx->output_export_idx[decl->Range.First] =
-						semantic_idx(&decl->Semantic);
-
-				if (ctx->type == PIPE_SHADER_VERTEX) {
-					switch (name) {
-					case TGSI_SEMANTIC_POSITION:
-						ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
-						ctx->num_position++;
-						break;
-					case TGSI_SEMANTIC_PSIZE:
-						ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
-						ctx->num_position++;
-						break;
-					case TGSI_SEMANTIC_COLOR:
-					case TGSI_SEMANTIC_GENERIC:
-						ctx->num_param++;
-						break;
-					default:
-						DBG("unknown VS semantic name: %s",
-								tgsi_semantic_names[name]);
-						assert(0);
-					}
-				} else {
-					switch (name) {
-					case TGSI_SEMANTIC_COLOR:
-					case TGSI_SEMANTIC_GENERIC:
-						ctx->num_param++;
-						break;
-					default:
-						DBG("unknown PS semantic name: %s",
-								tgsi_semantic_names[name]);
-						assert(0);
-					}
-				}
-			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
-				ctx->input_export_idx[decl->Range.First] =
-						semantic_idx(&decl->Semantic);
-			}
-			ctx->num_regs[decl->Declaration.File] =
-					MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
-			break;
-		}
-		case TGSI_TOKEN_TYPE_IMMEDIATE: {
-			struct tgsi_full_immediate *imm =
-					&ctx->parser.FullToken.FullImmediate;
-			unsigned n = ctx->so->num_immediates++;
-			memcpy(ctx->so->immediates[n].val, imm->u, 16);
-			break;
-		}
-		default:
-			break;
-		}
-	}
-
-	/* TGSI generated immediates are always entire vec4's, ones we
-	 * generate internally are not:
-	 */
-	ctx->immediate_idx = ctx->so->num_immediates * 4;
-
-	ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];
-
-	tgsi_parse_free(&ctx->parser);
-
-	return tgsi_parse_init(&ctx->parser, so->tokens);
-}
-
-static void
-compile_free(struct fd2_compile_context *ctx)
-{
-	tgsi_parse_free(&ctx->parser);
-}
-
-static void
-compile_vtx_fetch(struct fd2_compile_context *ctx)
-{
-	struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
-	int i;
-	for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
-		struct ir2_instruction *instr = ir2_instr_create(
-				ctx->so->ir, IR2_FETCH);
-		instr->fetch.opc = VTX_FETCH;
-
-		ctx->need_sync |= 1 << (i+1);
-
-		ir2_dst_create(instr, i+1, "xyzw", 0);
-		ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
-		if (i == 0)
-			instr->sync = true;
-
-		vfetch_instrs[i] = instr;
-	}
-	ctx->so->num_vfetch_instrs = i;
-}
-
-/*
- * For vertex shaders (VS):
- * --- ------ -------------
- *
- *   Inputs:     R1-R(num_input)
- *   Constants:  C0-C(num_const-1)
- *   Immediates: C(num_const)-C(num_const+num_imm-1)
- *   Outputs:    export0-export(n) and export62, export63
- *      n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
- *   Temps:      R(num_input+1)-R(num_input+num_temps)
- *
- * R0 could be clobbered after the vertex fetch instructions.. so we
- * could use it for one of the temporaries.
- *
- * TODO: maybe the vertex fetch part could fetch first input into R0 as
- * the last vtx fetch instruction, which would let us use the same
- * register layout in either case.. although this is not what the blob
- * compiler does.
- *
- *
- * For frag shaders (PS):
- * --- ---- -------------
- *
- *   Inputs:     R0-R(num_input-1)
- *   Constants:  same as VS
- *   Immediates: same as VS
- *   Outputs:    export0-export(num_outputs)
- *   Temps:      R(num_input)-R(num_input+num_temps-1)
- *
- * In either case, immediates are are postpended to the constants
- * (uniforms).
- *
- */
-
-static unsigned
-get_temp_gpr(struct fd2_compile_context *ctx, int idx)
-{
-	unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
-	if (ctx->type == PIPE_SHADER_VERTEX)
-		num++;
-	return num;
-}
-
-static struct ir2_dst_register *
-add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
-		const struct tgsi_dst_register *dst)
-{
-	unsigned flags = 0, num = 0;
-	char swiz[5];
-
-	switch (dst->File) {
-	case TGSI_FILE_OUTPUT:
-		flags |= IR2_REG_EXPORT;
-		if (ctx->type == PIPE_SHADER_VERTEX) {
-			if (dst->Index == ctx->position) {
-				num = 62;
-			} else if (dst->Index == ctx->psize) {
-				num = 63;
-			} else {
-				num = export_linkage(ctx,
-						ctx->output_export_idx[dst->Index]);
-			}
-		} else {
-			num = dst->Index;
-		}
-		break;
-	case TGSI_FILE_TEMPORARY:
-		num = get_temp_gpr(ctx, dst->Index);
-		break;
-	default:
-		DBG("unsupported dst register file: %s",
-			tgsi_file_name(dst->File));
-		assert(0);
-		break;
-	}
-
-	swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
-	swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
-	swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
-	swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
-	swiz[4] = '\0';
-
-	return ir2_dst_create(alu, num, swiz, flags);
-}
-
-static struct ir2_src_register *
-add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
-		const struct tgsi_src_register *src)
-{
-	static const char swiz_vals[] = {
-			'x', 'y', 'z', 'w',
-	};
-	char swiz[5];
-	unsigned flags = 0, num = 0;
-
-	switch (src->File) {
-	case TGSI_FILE_CONSTANT:
-		num = src->Index;
-		flags |= IR2_REG_CONST;
-		break;
-	case TGSI_FILE_INPUT:
-		if (ctx->type == PIPE_SHADER_VERTEX) {
-			num = src->Index + 1;
-		} else {
-			flags |= IR2_REG_INPUT;
-			num = export_linkage(ctx,
-					ctx->input_export_idx[src->Index]);
-		}
-		break;
-	case TGSI_FILE_TEMPORARY:
-		num = get_temp_gpr(ctx, src->Index);
-		break;
-	case TGSI_FILE_IMMEDIATE:
-		num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
-		flags |= IR2_REG_CONST;
-		break;
-	default:
-		DBG("unsupported src register file: %s",
-			tgsi_file_name(src->File));
-		assert(0);
-		break;
-	}
-
-	if (src->Absolute)
-		flags |= IR2_REG_ABS;
-	if (src->Negate)
-		flags |= IR2_REG_NEGATE;
-
-	swiz[0] = swiz_vals[src->SwizzleX];
-	swiz[1] = swiz_vals[src->SwizzleY];
-	swiz[2] = swiz_vals[src->SwizzleZ];
-	swiz[3] = swiz_vals[src->SwizzleW];
-	swiz[4] = '\0';
-
-	if ((ctx->need_sync & ((uint64_t)1 << num)) &&
-			!(flags & IR2_REG_CONST)) {
-		alu->sync = true;
-		ctx->need_sync &= ~((uint64_t)1 << num);
-	}
-
-	return ir2_reg_create(alu, num, swiz, flags);
-}
-
-static void
-add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	if (inst->Instruction.Saturate) {
-		alu->alu_vector.clamp = true;
-	}
-}
-
-static void
-add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	if (inst->Instruction.Saturate) {
-		alu->alu_scalar.clamp = true;
-	}
-}
-
-static void
-add_regs_vector_1(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	assert(inst->Instruction.NumSrcRegs == 1);
-	assert(inst->Instruction.NumDstRegs == 1);
-
-	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-	add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_vector_2(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	assert(inst->Instruction.NumSrcRegs == 2);
-	assert(inst->Instruction.NumDstRegs == 1);
-
-	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[1].Register);
-	add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_vector_3(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	assert(inst->Instruction.NumSrcRegs == 3);
-	assert(inst->Instruction.NumDstRegs == 1);
-
-	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[1].Register);
-	add_src_reg(ctx, alu, &inst->Src[2].Register);
-	add_vector_clamp(inst, alu);
-}
-
-static void
-add_regs_scalar_1(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
-{
-	assert(inst->Instruction.NumSrcRegs == 1);
-	assert(inst->Instruction.NumDstRegs == 1);
-
-	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-	add_scalar_clamp(inst, alu);
-}
-
-/*
- * Helpers for TGSI instructions that don't map to a single shader instr:
- */
-
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
-	src->File      = dst->File;
-	src->Indirect  = dst->Indirect;
-	src->Dimension = dst->Dimension;
-	src->Index     = dst->Index;
-	src->Absolute  = 0;
-	src->Negate    = 0;
-	src->SwizzleX  = TGSI_SWIZZLE_X;
-	src->SwizzleY  = TGSI_SWIZZLE_Y;
-	src->SwizzleZ  = TGSI_SWIZZLE_Z;
-	src->SwizzleW  = TGSI_SWIZZLE_W;
-}
-
-/* Get internal-temp src/dst to use for a sequence of instructions
- * generated by a single TGSI op.
- */
-static void
-get_internal_temp(struct fd2_compile_context *ctx,
-		struct tgsi_dst_register *tmp_dst,
-		struct tgsi_src_register *tmp_src)
-{
-	int n;
-
-	tmp_dst->File      = TGSI_FILE_TEMPORARY;
-	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
-	tmp_dst->Indirect  = 0;
-	tmp_dst->Dimension = 0;
-
-	/* assign next temporary: */
-	n = ctx->num_internal_temps++;
-	if (ctx->pred_reg != -1)
-		n++;
-
-	tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;
-
-	src_from_dst(tmp_src, tmp_dst);
-}
-
-static void
-get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst,
-		struct tgsi_src_register *src)
-{
-	assert(ctx->pred_reg != -1);
-
-	dst->File      = TGSI_FILE_TEMPORARY;
-	dst->WriteMask = TGSI_WRITEMASK_W;
-	dst->Indirect  = 0;
-	dst->Dimension = 0;
-	dst->Index     = get_temp_gpr(ctx, ctx->pred_reg);
-
-	if (src) {
-		src_from_dst(src, dst);
-		src->SwizzleX  = TGSI_SWIZZLE_W;
-		src->SwizzleY  = TGSI_SWIZZLE_W;
-		src->SwizzleZ  = TGSI_SWIZZLE_W;
-		src->SwizzleW  = TGSI_SWIZZLE_W;
-	}
-}
-
-static void
-push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
-{
-	struct ir2_instruction *alu;
-	struct tgsi_dst_register pred_dst;
-
-	if (ctx->pred_depth == 0) {
-		/* assign predicate register: */
-		ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];
-
-		get_predicate(ctx, &pred_dst, NULL);
-
-		alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SETNEs);
-		add_dst_reg(ctx, alu, &pred_dst);
-		add_src_reg(ctx, alu, src);
-	} else {
-		struct tgsi_src_register pred_src;
-
-		get_predicate(ctx, &pred_dst, &pred_src);
-
-		alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-		add_dst_reg(ctx, alu, &pred_dst);
-		add_src_reg(ctx, alu, &pred_src);
-		add_src_reg(ctx, alu, src);
-
-		// XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
-		// sure src reg is valid if it was calculated with a predicate
-		// condition..
-		alu->pred = IR2_PRED_NONE;
-	}
-
-	/* save previous pred state to restore in pop_predicate(): */
-	ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;
-}
-
-static void
-pop_predicate(struct fd2_compile_context *ctx)
-{
-	/* restore previous predicate state: */
-	ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];
-
-	if (ctx->pred_depth != 0) {
-		struct ir2_instruction *alu;
-		struct tgsi_dst_register pred_dst;
-		struct tgsi_src_register pred_src;
-
-		get_predicate(ctx, &pred_dst, &pred_src);
-
-		alu = ir2_instr_create_alu_s(ctx->so->ir, PRED_SET_POPs);
-		add_dst_reg(ctx, alu, &pred_dst);
-		add_src_reg(ctx, alu, &pred_src);
-		alu->pred = IR2_PRED_NONE;
-	} else {
-		/* predicate register no longer needed: */
-		ctx->pred_reg = -1;
-	}
-}
-
-static void
-get_immediate(struct fd2_compile_context *ctx,
-		struct tgsi_src_register *reg, uint32_t val)
-{
-	unsigned neg, swiz, idx, i;
-	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
-	static const unsigned swiz2tgsi[] = {
-			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
-	};
-
-	for (i = 0; i < ctx->immediate_idx; i++) {
-		swiz = i % 4;
-		idx  = i / 4;
-
-		if (ctx->so->immediates[idx].val[swiz] == val) {
-			neg = 0;
-			break;
-		}
-
-		if (ctx->so->immediates[idx].val[swiz] == -val) {
-			neg = 1;
-			break;
-		}
-	}
-
-	if (i == ctx->immediate_idx) {
-		/* need to generate a new immediate: */
-		swiz = i % 4;
-		idx  = i / 4;
-		neg  = 0;
-		ctx->so->immediates[idx].val[swiz] = val;
-		ctx->so->num_immediates = idx + 1;
-		ctx->immediate_idx++;
-	}
-
-	reg->File      = TGSI_FILE_IMMEDIATE;
-	reg->Indirect  = 0;
-	reg->Dimension = 0;
-	reg->Index     = idx;
-	reg->Absolute  = 0;
-	reg->Negate    = neg;
-	reg->SwizzleX  = swiz2tgsi[swiz];
-	reg->SwizzleY  = swiz2tgsi[swiz];
-	reg->SwizzleZ  = swiz2tgsi[swiz];
-	reg->SwizzleW  = swiz2tgsi[swiz];
-}
-
-/* POW(a,b) = EXP2(b * LOG2(a)) */
-static void
-translate_pow(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register tmp_src;
-	struct ir2_instruction *alu;
-
-	get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
-	alu = ir2_instr_create_alu_s(ctx->so->ir, LOG_CLAMP);
-	add_dst_reg(ctx, alu, &tmp_dst);
-	add_src_reg(ctx, alu, &inst->Src[0].Register);
-
-	alu = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-	add_dst_reg(ctx, alu, &tmp_dst);
-	add_src_reg(ctx, alu, &tmp_src);
-	add_src_reg(ctx, alu, &inst->Src[1].Register);
-
-	/* NOTE: some of the instructions, like EXP_IEEE, seem hard-
-	 * coded to take their input from the w component.
-	 */
-	switch(inst->Dst[0].Register.WriteMask) {
-	case TGSI_WRITEMASK_X:
-		tmp_src.SwizzleW = TGSI_SWIZZLE_X;
-		break;
-	case TGSI_WRITEMASK_Y:
-		tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
-		break;
-	case TGSI_WRITEMASK_Z:
-		tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
-		break;
-	case TGSI_WRITEMASK_W:
-		tmp_src.SwizzleW = TGSI_SWIZZLE_W;
-		break;
-	default:
-		DBG("invalid writemask!");
-		assert(0);
-		break;
-	}
-
-	alu = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
-	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
-	add_src_reg(ctx, alu, &tmp_src);
-	add_scalar_clamp(inst, alu);
-}
-
-static void
-translate_tex(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, unsigned opc)
-{
-	struct ir2_instruction *instr;
-	struct ir2_src_register *reg;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register tmp_src;
-	const struct tgsi_src_register *coord;
-	bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
-			inst->Instruction.Saturate;
-	int idx;
-
-	if (using_temp || (opc == TGSI_OPCODE_TXP))
-		get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
-	if (opc == TGSI_OPCODE_TXP) {
-		static const char *swiz[] = {
-				[TGSI_SWIZZLE_X] = "xxxx",
-				[TGSI_SWIZZLE_Y] = "yyyy",
-				[TGSI_SWIZZLE_Z] = "zzzz",
-				[TGSI_SWIZZLE_W] = "wwww",
-		};
-
-		/* TXP - Projective Texture Lookup:
-		 *
-		 *  coord.x = src0.x / src.w
-		 *  coord.y = src0.y / src.w
-		 *  coord.z = src0.z / src.w
-		 *  coord.w = src0.w
-		 *  bias = 0.0
-		 *
-		 *  dst = texture_sample(unit, coord, bias)
-		 */
-
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
-		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
-		add_src_reg(ctx, instr, &inst->Src[0].Register);
-		add_src_reg(ctx, instr, &inst->Src[0].Register);
-
-		instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
-		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
-		memcpy(add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle,
-			   swiz[inst->Src[0].Register.SwizzleW], 4);
-
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-		add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
-		add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
-		add_src_reg(ctx, instr, &inst->Src[0].Register);
-
-		coord = &tmp_src;
-	} else {
-		coord = &inst->Src[0].Register;
-	}
-
-	instr = ir2_instr_create(ctx->so->ir, IR2_FETCH);
-	instr->fetch.opc = TEX_FETCH;
-	instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
-	instr->fetch.is_rect = (inst->Texture.Texture == TGSI_TEXTURE_RECT);
-	assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?
-
-	/* save off the tex fetch to be patched later with correct const_idx: */
-	idx = ctx->so->num_tfetch_instrs++;
-	ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
-	ctx->so->tfetch_instrs[idx].instr = instr;
-
-	add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
-	reg = add_src_reg(ctx, instr, coord);
-
-	/* blob compiler always sets 3rd component to same as 1st for 2d: */
-	if (inst->Texture.Texture == TGSI_TEXTURE_2D || inst->Texture.Texture == TGSI_TEXTURE_RECT)
-		reg->swizzle[2] = reg->swizzle[0];
-
-	/* dst register needs to be marked for sync: */
-	ctx->need_sync |= 1 << instr->dst_reg.num;
-
-	/* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
-	instr->sync = true;
-
-	if (using_temp) {
-		/* texture fetch can't write directly to export, so if tgsi
-		 * is telling us the dst register is in output file, we load
-		 * the texture to a temp and the use ALU instruction to move
-		 * to output
-		 */
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
-
-		add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-		add_src_reg(ctx, instr, &tmp_src);
-		add_src_reg(ctx, instr, &tmp_src);
-		add_vector_clamp(inst, instr);
-	}
-}
-
-/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
-/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
-/* SEQ(a,b) = EQU((b - a), 1.0, 0.0) */
-/* SNE(a,b) = EQU((b - a), 0.0, 1.0) */
-static void
-translate_sge_slt_seq_sne(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst, unsigned opc)
-{
-	struct ir2_instruction *instr;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register tmp_src;
-	struct tgsi_src_register tmp_const;
-	float c0, c1;
-	instr_vector_opc_t vopc;
-
-	switch (opc) {
-	default:
-		assert(0);
-	case TGSI_OPCODE_SGE:
-		c0 = 1.0;
-		c1 = 0.0;
-		vopc = CNDGTEv;
-		break;
-	case TGSI_OPCODE_SLT:
-		c0 = 0.0;
-		c1 = 1.0;
-		vopc = CNDGTEv;
-		break;
-	case TGSI_OPCODE_SEQ:
-		c0 = 0.0;
-		c1 = 1.0;
-		vopc = CNDEv;
-		break;
-	case TGSI_OPCODE_SNE:
-		c0 = 1.0;
-		c1 = 0.0;
-		vopc = CNDEv;
-		break;
-	}
-
-	get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
-	instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
-	add_dst_reg(ctx, instr, &tmp_dst);
-	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
-	add_src_reg(ctx, instr, &inst->Src[1].Register);
-
-	instr = ir2_instr_create_alu_v(ctx->so->ir, vopc);
-	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-	add_src_reg(ctx, instr, &tmp_src);
-	get_immediate(ctx, &tmp_const, fui(c1));
-	add_src_reg(ctx, instr, &tmp_const);
-	get_immediate(ctx, &tmp_const, fui(c0));
-	add_src_reg(ctx, instr, &tmp_const);
-}
-
-/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
-static void
-translate_lrp(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst,
-		unsigned opc)
-{
-	struct ir2_instruction *instr;
-	struct tgsi_dst_register tmp_dst1, tmp_dst2;
-	struct tgsi_src_register tmp_src1, tmp_src2;
-	struct tgsi_src_register tmp_const;
-
-	get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
-	get_internal_temp(ctx, &tmp_dst2, &tmp_src2);
-
-	get_immediate(ctx, &tmp_const, fui(1.0));
-
-	/* tmp1 = (a * b) */
-	instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-	add_dst_reg(ctx, instr, &tmp_dst1);
-	add_src_reg(ctx, instr, &inst->Src[0].Register);
-	add_src_reg(ctx, instr, &inst->Src[1].Register);
-
-	/* tmp2 = (1 - a) */
-	instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
-	add_dst_reg(ctx, instr, &tmp_dst2);
-	add_src_reg(ctx, instr, &tmp_const);
-	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
-
-	/* tmp2 = tmp2 * c */
-	instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-	add_dst_reg(ctx, instr, &tmp_dst2);
-	add_src_reg(ctx, instr, &tmp_src2);
-	add_src_reg(ctx, instr, &inst->Src[2].Register);
-
-	/* dst = tmp1 + tmp2 */
-	instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
-	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-	add_src_reg(ctx, instr, &tmp_src1);
-	add_src_reg(ctx, instr, &tmp_src2);
-}
-
-static void
-translate_trig(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst,
-		unsigned opc)
-{
-	struct ir2_instruction *instr;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register tmp_src;
-	struct tgsi_src_register tmp_const;
-	instr_scalar_opc_t op;
-
-	switch (opc) {
-	default:
-		assert(0);
-	case TGSI_OPCODE_SIN:
-		op = SIN;
-		break;
-	case TGSI_OPCODE_COS:
-		op = COS;
-		break;
-	}
-
-	get_internal_temp(ctx, &tmp_dst, &tmp_src);
-
-	tmp_dst.WriteMask = TGSI_WRITEMASK_X;
-	tmp_src.SwizzleX = tmp_src.SwizzleY =
-			tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;
-
-	instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
-	add_dst_reg(ctx, instr, &tmp_dst);
-	add_src_reg(ctx, instr, &inst->Src[0].Register);
-	get_immediate(ctx, &tmp_const, fui(0.159155));
-	add_src_reg(ctx, instr, &tmp_const);
-	get_immediate(ctx, &tmp_const, fui(0.5));
-	add_src_reg(ctx, instr, &tmp_const);
-
-	instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
-	add_dst_reg(ctx, instr, &tmp_dst);
-	add_src_reg(ctx, instr, &tmp_src);
-	add_src_reg(ctx, instr, &tmp_src);
-
-	instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
-	add_dst_reg(ctx, instr, &tmp_dst);
-	add_src_reg(ctx, instr, &tmp_src);
-	get_immediate(ctx, &tmp_const, fui(6.283185));
-	add_src_reg(ctx, instr, &tmp_const);
-	get_immediate(ctx, &tmp_const, fui(-3.141593));
-	add_src_reg(ctx, instr, &tmp_const);
-
-	instr = ir2_instr_create_alu_s(ctx->so->ir, op);
-	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-	add_src_reg(ctx, instr, &tmp_src);
-}
-
-static void
-translate_dp2(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst,
-		unsigned opc)
-{
-	struct tgsi_src_register tmp_const;
-	struct ir2_instruction *instr;
-	/* DP2ADD c,a,b -> dot2(a,b) + c */
-	/* for c we use the constant 0.0 */
-	instr = ir2_instr_create_alu_v(ctx->so->ir, DOT2ADDv);
-	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
-	add_src_reg(ctx, instr, &inst->Src[0].Register);
-	add_src_reg(ctx, instr, &inst->Src[1].Register);
-	get_immediate(ctx, &tmp_const, fui(0.0f));
-	add_src_reg(ctx, instr, &tmp_const);
-	add_vector_clamp(inst, instr);
-}
-
-/*
- * Main part of compiler/translator:
- */
-
-static void
-translate_instruction(struct fd2_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	unsigned opc = inst->Instruction.Opcode;
-	struct ir2_instruction *instr;
-
-	if (opc == TGSI_OPCODE_END)
-		return;
-
-	/* TODO turn this into a table: */
-	switch (opc) {
-	case TGSI_OPCODE_MOV:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
-		add_regs_vector_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_RCP:
-		instr = ir2_instr_create_alu_s(ctx->so->ir, RECIP_IEEE);
-		add_regs_scalar_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_RSQ:
-		instr = ir2_instr_create_alu_s(ctx->so->ir, RECIPSQ_IEEE);
-		add_regs_scalar_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_SQRT:
-		instr = ir2_instr_create_alu_s(ctx->so->ir, SQRT_IEEE);
-		add_regs_scalar_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_MUL:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MULv);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_ADD:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, ADDv);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_DP2:
-		translate_dp2(ctx, inst, opc);
-		break;
-	case TGSI_OPCODE_DP3:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, DOT3v);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_DP4:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, DOT4v);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_MIN:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MINv);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_MAX:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MAXv);
-		add_regs_vector_2(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_SLT:
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_SNE:
-		translate_sge_slt_seq_sne(ctx, inst, opc);
-		break;
-	case TGSI_OPCODE_MAD:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, MULADDv);
-		add_regs_vector_3(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_LRP:
-		translate_lrp(ctx, inst, opc);
-		break;
-	case TGSI_OPCODE_FRC:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, FRACv);
-		add_regs_vector_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_FLR:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, FLOORv);
-		add_regs_vector_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_EX2:
-		instr = ir2_instr_create_alu_s(ctx->so->ir, EXP_IEEE);
-		add_regs_scalar_1(ctx, inst, instr);
-		break;
-	case TGSI_OPCODE_POW:
-		translate_pow(ctx, inst);
-		break;
-	case TGSI_OPCODE_COS:
-	case TGSI_OPCODE_SIN:
-		translate_trig(ctx, inst, opc);
-		break;
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXP:
-		translate_tex(ctx, inst, opc);
-		break;
-	case TGSI_OPCODE_CMP:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, CNDGTEv);
-		add_regs_vector_3(ctx, inst, instr);
-		instr->src_reg[0].flags ^= IR2_REG_NEGATE; /* src1 */
-		break;
-	case TGSI_OPCODE_IF:
-		push_predicate(ctx, &inst->Src[0].Register);
-		ctx->so->ir->pred = IR2_PRED_EQ;
-		break;
-	case TGSI_OPCODE_ELSE:
-		ctx->so->ir->pred = IR2_PRED_NE;
-		break;
-	case TGSI_OPCODE_ENDIF:
-		pop_predicate(ctx);
-		break;
-	case TGSI_OPCODE_F2I:
-		instr = ir2_instr_create_alu_v(ctx->so->ir, TRUNCv);
-		add_regs_vector_1(ctx, inst, instr);
-		break;
-	default:
-		DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
-		tgsi_dump(ctx->so->tokens, 0);
-		assert(0);
-		break;
-	}
-
-	/* internal temporaries are only valid for the duration of a single
-	 * TGSI instruction:
-	 */
-	ctx->num_internal_temps = 0;
-}
-
-static void
-compile_instructions(struct fd2_compile_context *ctx)
-{
-	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-		tgsi_parse_token(&ctx->parser);
-
-		switch (ctx->parser.FullToken.Token.Type) {
-		case TGSI_TOKEN_TYPE_INSTRUCTION:
-			translate_instruction(ctx,
-					&ctx->parser.FullToken.FullInstruction);
-			break;
-		default:
-			break;
-		}
-	}
-}
-
-int
-fd2_compile_shader(struct fd_program_stateobj *prog,
-		struct fd2_shader_stateobj *so)
-{
-	struct fd2_compile_context ctx;
-
-	ir2_shader_destroy(so->ir);
-	so->ir = ir2_shader_create();
-	so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;
-
-	if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
-		return -1;
-
-	if (ctx.type == PIPE_SHADER_VERTEX) {
-		compile_vtx_fetch(&ctx);
-	} else if (ctx.type == PIPE_SHADER_FRAGMENT) {
-		prog->num_exports = 0;
-		memset(prog->export_linkage, 0xff,
-				sizeof(prog->export_linkage));
-	}
-
-	compile_instructions(&ctx);
-
-	compile_free(&ctx);
-
-	return 0;
-}
-
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h
deleted file mode 100644
index f26bb2ffc4..0000000000
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (C) 2012 Rob Clark <robclark at freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark at freedesktop.org>
- */
-
-#ifndef FD2_COMPILER_H_
-#define FD2_COMPILER_H_
-
-#include "fd2_program.h"
-#include "fd2_util.h"
-
-int fd2_compile_shader(struct fd_program_stateobj *prog,
-		struct fd2_shader_stateobj *so);
-
-#endif /* FD2_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
index 46c76df807..fa59afb3e7 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_draw.c
@@ -236,7 +236,7 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
 		colr = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM, color->f);
 
 	/* emit generic state now: */
-	fd2_emit_state(ctx, ctx->dirty &
+	fd2_emit_state(ctx, ring, ctx->dirty &
 			(FD_DIRTY_BLEND | FD_DIRTY_VIEWPORT |
 					FD_DIRTY_FRAMEBUFFER | FD_DIRTY_SCISSOR));
 
@@ -252,7 +252,7 @@ fd2_clear(struct fd_context *ctx, unsigned buffers,
 	OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
 	OUT_RING(ring, 0x0000028f);
 
-	fd2_program_emit(ring, &ctx->solid_prog);
+	fd2_program_emit(ctx->batch, ring, &ctx->solid_prog);
 
 	OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
 	OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
index 4e93cb39b0..d87e0a360d 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_emit.c
@@ -284,11 +284,25 @@ fd2_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A2XX_PA_CL_VTE_CNTL_VPORT_Y_OFFSET_ENA |
 				A2XX_PA_CL_VTE_CNTL_VPORT_Z_SCALE_ENA |
 				A2XX_PA_CL_VTE_CNTL_VPORT_Z_OFFSET_ENA);
+
+		/* set viewport in C65/C66, for a20x hw binning and fragcoord.z */
+		OUT_PKT3(ring, CP_SET_CONSTANT, 9);
+		OUT_RING(ring, 0x00000184);
+
+		OUT_RING(ring, fui(ctx->viewport.translate[0]));
+		OUT_RING(ring, fui(ctx->viewport.translate[1]));
+		OUT_RING(ring, fui(ctx->viewport.translate[2]));
+		OUT_RING(ring, fui(0.0f));
+
+		OUT_RING(ring, fui(ctx->viewport.scale[0]));
+		OUT_RING(ring, fui(ctx->viewport.scale[1]));
+		OUT_RING(ring, fui(ctx->viewport.scale[2]));
+		OUT_RING(ring, fui(0.0f));
 	}
 
 	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_VTXSTATE | FD_DIRTY_TEXSTATE)) {
 		fd2_program_validate(ctx);
-		fd2_program_emit(ring, &ctx->prog);
+		fd2_program_emit(ctx->batch, ring, &ctx->prog);
 	}
 
 	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONST)) {
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
index b6b7050026..c5609628d0 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_gmem.c
@@ -39,6 +39,7 @@
 #include "fd2_program.h"
 #include "fd2_util.h"
 #include "fd2_zsa.h"
+#include "instr-a2xx.h"
 
 static uint32_t fmt2swap(enum pipe_format format)
 {
@@ -135,7 +136,7 @@ fd2_emit_tile_gmem2mem(struct fd_batch *batch, struct fd_tile *tile)
 	OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
 	OUT_RING(ring, 0x0000028f);
 
-	fd2_program_emit(ring, &ctx->solid_prog);
+	fd2_program_emit(batch, ring, &ctx->solid_prog);
 
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_PA_SC_AA_MASK));
@@ -282,7 +283,7 @@ fd2_emit_tile_mem2gmem(struct fd_batch *batch, struct fd_tile *tile)
 	OUT_RING(ring, CP_REG(REG_A2XX_VGT_VERTEX_REUSE_BLOCK_CNTL));
 	OUT_RING(ring, 0x0000003b);
 
-	fd2_program_emit(ring, &ctx->blit_prog[0]);
+	fd2_program_emit(batch, ring, &ctx->blit_prog[0]);
 
 	OUT_PKT0(ring, REG_A2XX_TC_CNTL_STATUS, 1);
 	OUT_RING(ring, A2XX_TC_CNTL_STATUS_L2_INVALIDATE);
@@ -522,6 +523,16 @@ fd2_emit_tile_renderprep(struct fd_batch *batch, struct fd_tile *tile)
 	OUT_RING(ring, A2XX_PA_SC_WINDOW_OFFSET_X(-tile->xoff) |
 			A2XX_PA_SC_WINDOW_OFFSET_Y(-tile->yoff));
 
+	/* parameters for fragcoord in fragment shader (C64.xy)
+	 * TODO: invert Y here instead of letting NIR do it
+	 */
+	OUT_PKT3(ring, CP_SET_CONSTANT, 5);
+	OUT_RING(ring, 0x00000580);
+	OUT_RING(ring, fui(1.0f));
+	OUT_RING(ring, fui(1.0f));
+	OUT_RING(ring, fui(tile->xoff + 0.5f));
+	OUT_RING(ring, fui(tile->yoff + 0.5f));
+
 	if (is_a20x(ctx->screen) && !(fd_mesa_debug & FD_DBG_NOBIN)) {
 		struct fd_vsc_pipe *pipe = &ctx->vsc_pipe[tile->p];
 
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.c b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
index 9a35e8f904..fc576160c5 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.c
@@ -22,6 +22,7 @@
  *
  * Authors:
  *    Rob Clark <robclark at freedesktop.org>
+ *    Jonathan Marek <jonathan at marek.ca>
  */
 
 #include "pipe/p_state.h"
@@ -34,10 +35,11 @@
 
 #include "freedreno_program.h"
 
+#include "ir2.h"
 #include "fd2_program.h"
-#include "fd2_compiler.h"
 #include "fd2_texture.h"
 #include "fd2_util.h"
+#include "instr-a2xx.h"
 
 static struct fd2_shader_stateobj *
 create_shader(enum shader_t type)
@@ -54,88 +56,75 @@ delete_shader(struct fd2_shader_stateobj *so)
 {
 	if (!so)
 		return;
-	ir2_shader_destroy(so->ir);
-	free(so->tokens);
-	free(so->bin);
+	ralloc_free(so->nir);
+	free(so->info[0].dwords);
+	free(so->info[1].dwords);
 	free(so);
 }
 
-static struct fd2_shader_stateobj *
-assemble(struct fd2_shader_stateobj *so)
+static void
+emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
+		struct fd2_shader_stateobj *so)
 {
-	free(so->bin);
-	so->bin = ir2_shader_assemble(so->ir, &so->info);
-	if (!so->bin)
-		goto fail;
+	bool binning = ring == batch->binning;
+	bool a20x_binning = binning && is_a20x(batch->ctx->screen);
+	struct ir2_shader_info *info;
+	unsigned i;
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("disassemble: type=%d", so->type);
-		disasm_a2xx(so->bin, so->info.sizedwords, 0, so->type);
-	}
+	info = &so->info[binning];
 
-	return so;
+	assert(info->sizedwords);
 
-fail:
-	debug_error("assemble failed!");
-	delete_shader(so);
-	return NULL;
+	OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + info->sizedwords);
+	OUT_RING(ring, (so->type == SHADER_VERTEX) ? 0 : 1);
+	OUT_RING(ring, info->sizedwords);
+	for (i = 0; i < info->sizedwords; i++) {
+		if (a20x_binning && i == info->export32_offset)
+			OUT_RINGP(ring, info->dwords[i], &batch->draw_patches);
+		else
+			OUT_RING(ring, info->dwords[i]);
+	}
 }
 
-static struct fd2_shader_stateobj *
-compile(struct fd_program_stateobj *prog, struct fd2_shader_stateobj *so)
+static int
+ir2_glsl_type_size(const struct glsl_type *type)
+{
+	return glsl_count_attribute_slots(type, false);
+}
+
+static void *
+fd2_fp_state_create(struct pipe_context *pctx,
+		const struct pipe_shader_state *cso)
 {
-	int ret;
+	struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
+	if (!so)
+		return NULL;
 
-	if (fd_mesa_debug & FD_DBG_DISASM) {
-		DBG("dump tgsi: type=%d", so->type);
-		tgsi_dump(so->tokens, 0);
+	if (cso->type == PIPE_SHADER_IR_NIR) {
+		so->nir = cso->ir.nir;
+		NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size,
+			   (nir_lower_io_options)0);
+	} else {
+		assert(cso->type == PIPE_SHADER_IR_TGSI);
+		so->nir = ir2_tgsi_to_nir(cso->tokens);
 	}
 
-	ret = fd2_compile_shader(prog, so);
-	if (ret)
+	if (ir2_optimize_nir(so->nir, true))
 		goto fail;
 
-	/* NOTE: we don't assemble yet because for VS we don't know the
-	 * type information for vertex fetch yet.. so those need to be
-	 * patched up later before assembling.
-	 */
+	so->first_immediate = so->nir->num_uniforms;
 
-	so->info.sizedwords = 0;
+	ir2_compile(so, 0);
 
+	ralloc_free(so->nir);
+	so->nir = NULL;
 	return so;
 
 fail:
-	debug_error("compile failed!");
 	delete_shader(so);
 	return NULL;
 }
 
-static void
-emit(struct fd_ringbuffer *ring, struct fd2_shader_stateobj *so)
-{
-	unsigned i;
-
-	if (so->info.sizedwords == 0)
-		assemble(so);
-
-	OUT_PKT3(ring, CP_IM_LOAD_IMMEDIATE, 2 + so->info.sizedwords);
-	OUT_RING(ring, (so->type == SHADER_VERTEX) ? 0 : 1);
-	OUT_RING(ring, so->info.sizedwords);
-	for (i = 0; i < so->info.sizedwords; i++)
-		OUT_RING(ring, so->bin[i]);
-}
-
-static void *
-fd2_fp_state_create(struct pipe_context *pctx,
-		const struct pipe_shader_state *cso)
-{
-	struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
-	if (!so)
-		return NULL;
-	so->tokens = tgsi_dup_tokens(cso->tokens);
-	return so;
-}
-
 static void
 fd2_fp_state_delete(struct pipe_context *pctx, void *hwcso)
 {
@@ -150,8 +139,27 @@ fd2_vp_state_create(struct pipe_context *pctx,
 	struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
 	if (!so)
 		return NULL;
-	so->tokens = tgsi_dup_tokens(cso->tokens);
+
+	if (cso->type == PIPE_SHADER_IR_NIR) {
+		so->nir = cso->ir.nir;
+		NIR_PASS_V(so->nir, nir_lower_io, nir_var_all, ir2_glsl_type_size,
+			   (nir_lower_io_options)0);
+	} else {
+		assert(cso->type == PIPE_SHADER_IR_TGSI);
+		so->nir = ir2_tgsi_to_nir(cso->tokens);
+	}
+
+	if (ir2_optimize_nir(so->nir, true))
+		goto fail;
+
+	so->first_immediate = so->nir->num_uniforms;
+
+	/* can't compile the vertex shader here, as it depends on fs */
 	return so;
+
+fail:
+	delete_shader(so);
+	return NULL;
 }
 
 static void
@@ -162,77 +170,56 @@ fd2_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 }
 
 static void
-patch_vtx_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
-		struct fd_vertex_stateobj *vtx)
+patch_vtx_fetch(struct fd_context *ctx, struct pipe_vertex_element *elem,
+	instr_fetch_vtx_t *instr, uint16_t dst_swiz)
 {
-	unsigned i;
-
-	assert(so->num_vfetch_instrs == vtx->num_elements);
-
-	/* update vtx fetch instructions: */
-	for (i = 0; i < so->num_vfetch_instrs; i++) {
-		struct ir2_instruction *instr = so->vfetch_instrs[i];
-		struct pipe_vertex_element *elem = &vtx->pipe[i];
-		struct pipe_vertex_buffer *vb =
+	struct pipe_vertex_buffer *vb =
 				&ctx->vtx.vertexbuf.vb[elem->vertex_buffer_index];
-		enum pipe_format format = elem->src_format;
-		const struct util_format_description *desc =
-				util_format_description(format);
-		unsigned j;
-
-		/* Find the first non-VOID channel. */
-		for (j = 0; j < 4; j++)
-			if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID)
-				break;
-
-		/* CI/CIS can probably be set in compiler instead: */
-		instr->fetch.const_idx = 20 + (i / 3);
-		instr->fetch.const_idx_sel = i % 3;
-
-		instr->fetch.fmt = fd2_pipe2surface(format);
-		instr->fetch.is_normalized = desc->channel[j].normalized;
-		instr->fetch.is_signed =
-				desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED;
-		instr->fetch.stride = vb->stride ? : 1;
-		instr->fetch.offset = elem->src_offset;
-
-		for (j = 0; j < 4; j++)
-			instr->dst_reg.swizzle[j] = "xyzw01__"[desc->swizzle[j]];
-
-		assert(instr->fetch.fmt != ~0);
-
-		DBG("vtx[%d]: %s (%d), ci=%d, cis=%d, id=%d, swizzle=%s, "
-				"stride=%d, offset=%d",
-				i, util_format_name(format),
-				instr->fetch.fmt,
-				instr->fetch.const_idx,
-				instr->fetch.const_idx_sel,
-				elem->instance_divisor,
-				instr->dst_reg.swizzle,
-				instr->fetch.stride,
-				instr->fetch.offset);
+	enum pipe_format format = elem->src_format;
+	const struct util_format_description *desc =
+			util_format_description(format);
+	unsigned j;
+
+	/* Find the first non-VOID channel. */
+	for (j = 0; j < 4; j++)
+		if (desc->channel[j].type != UTIL_FORMAT_TYPE_VOID)
+			break;
+
+	instr->format = fd2_pipe2surface(format);
+	instr->num_format_all = !desc->channel[j].normalized;
+	instr->format_comp_all = desc->channel[j].type == UTIL_FORMAT_TYPE_SIGNED;
+	instr->stride = vb->stride;
+	instr->offset = elem->src_offset;
+
+	unsigned swiz = 0;
+	for (int i = 0; i < 4; i++) {
+		unsigned s = dst_swiz >> i*3 & 7;
+		swiz |= (s >= 4 ? s : desc->swizzle[s]) << i*3;
 	}
-
-	/* trigger re-assemble: */
-	so->info.sizedwords = 0;
+	instr->dst_swiz = swiz;
 }
 
 static void
-patch_tex_fetches(struct fd_context *ctx, struct fd2_shader_stateobj *so,
-		struct fd_texture_stateobj *tex)
+patch_fetches(struct fd_context *ctx, struct ir2_shader_info *info,
+	struct fd_vertex_stateobj *vtx, struct fd_texture_stateobj *tex)
 {
-	unsigned i;
-
-	/* update tex fetch instructions: */
-	for (i = 0; i < so->num_tfetch_instrs; i++) {
-		struct ir2_instruction *instr = so->tfetch_instrs[i].instr;
-		unsigned samp_id = so->tfetch_instrs[i].samp_id;
-		unsigned const_idx = fd2_get_const_idx(ctx, tex, samp_id);
+	for (int i = 0; i < info->num_fetch_instrs; i++) {
+		struct ir2_fetch_info *fi = &info->fetch_info[i];
+
+		instr_fetch_t *instr = (instr_fetch_t*) &info->dwords[fi->offset];
+		if (instr->opc == VTX_FETCH) {
+			unsigned idx = (instr->vtx.const_index - 20) * 3 +
+				instr->vtx.const_index_sel;
+			patch_vtx_fetch(ctx, &vtx->pipe[idx], &instr->vtx, fi->vtx.dst_swiz);
+			continue;
+		}
 
-		if (const_idx != instr->fetch.const_idx) {
-			instr->fetch.const_idx = const_idx;
-			/* trigger re-assemble: */
-			so->info.sizedwords = 0;
+		assert(instr->opc == TEX_FETCH);
+		instr->tex.const_idx = fd2_get_const_idx(ctx, tex, fi->tex.samp_id);
+		instr->tex.src_swiz = fi->tex.src_swiz;
+		if (fd2_texture_swap_xy(tex, fi->tex.samp_id)) {
+			unsigned x = instr->tex.src_swiz;
+			instr->tex.src_swiz = (x & 0x30) | (x & 3) << 2 | (x >> 2 & 3);
 		}
 	}
 }
@@ -241,198 +228,74 @@ void
 fd2_program_validate(struct fd_context *ctx)
 {
 	struct fd_program_stateobj *prog = &ctx->prog;
-	bool dirty_fp = !!(ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & FD_DIRTY_SHADER_PROG);
-	bool dirty_vp = !!(ctx->dirty_shader[PIPE_SHADER_VERTEX] & FD_DIRTY_SHADER_PROG);
-
-	/* if vertex or frag shader is dirty, we may need to recompile. Compile
-	 * frag shader first, as that assigns the register slots for exports
-	 * from the vertex shader.  And therefore if frag shader has changed we
-	 * need to recompile both vert and frag shader.
-	 */
-	if (dirty_fp)
-		compile(prog, prog->fp);
-
-	if (dirty_fp || dirty_vp)
-		compile(prog, prog->vp);
-
-	/* if necessary, fix up vertex fetch instructions: */
-	if (ctx->dirty & (FD_DIRTY_VTXSTATE | FD_DIRTY_PROG))
-		patch_vtx_fetches(ctx, prog->vp, ctx->vtx.vtx);
-
-	/* if necessary, fix up texture fetch instructions: */
-	if (ctx->dirty & (FD_DIRTY_TEXSTATE | FD_DIRTY_PROG)) {
-		patch_tex_fetches(ctx, prog->vp, &ctx->tex[PIPE_SHADER_VERTEX]);
-		patch_tex_fetches(ctx, prog->fp, &ctx->tex[PIPE_SHADER_FRAGMENT]);
+	struct fd2_shader_stateobj *fp = prog->fp, *vp = prog->vp;
+
+	/* recompile vertex shader when fragment shader changes */
+	if (!vp->info[0].sizedwords || memcmp(&fp->f, &vp->f, sizeof(fp->f))) {
+		vp->f = fp->f;
+		ir2_compile(vp, 0);
+		ir2_compile(vp, 1);
 	}
+
+	/* patch fetch instructions */
+	patch_fetches(ctx, &vp->info[0], ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]);
+	patch_fetches(ctx, &vp->info[1], ctx->vtx.vtx, &ctx->tex[PIPE_SHADER_VERTEX]);
+	patch_fetches(ctx, &fp->info[0], NULL, &ctx->tex[PIPE_SHADER_FRAGMENT]);
 }
 
 void
-fd2_program_emit(struct fd_ringbuffer *ring,
+fd2_program_emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		struct fd_program_stateobj *prog)
 {
-	struct ir2_shader_info *vsi =
-		&((struct fd2_shader_stateobj *)prog->vp)->info;
-	struct ir2_shader_info *fsi =
-		&((struct fd2_shader_stateobj *)prog->fp)->info;
-	uint8_t vs_gprs, fs_gprs, vs_export;
+	struct fd2_shader_stateobj *fp, *vp;
+	uint8_t vs_gprs, fs_gprs = 0, vs_export = 0;
+	bool binning = (ring == batch->binning);
+
+	vp = prog->vp;
+	fp = prog->fp;
 
-	emit(ring, prog->vp);
-	emit(ring, prog->fp);
+	emit(batch, ring, vp);
 
-	vs_gprs = (vsi->max_reg < 0) ? 0x80 : vsi->max_reg;
-	fs_gprs = (fsi->max_reg < 0) ? 0x80 : fsi->max_reg;
-	vs_export = MAX2(1, prog->num_exports) - 1;
+	if (!binning) {
+		emit(batch, ring, fp);
+		fs_gprs = (fp->info[0].max_reg < 0) ? 0x80 : fp->info[0].max_reg;
+		vs_export = MAX2(1, fp->f.inputs_count) - 1;
+	}
+
+	vs_gprs = (vp->info[binning].max_reg < 0) ? 0x80 : vp->info[binning].max_reg;
+
+	enum a2xx_sq_ps_vtx_mode mode = POSITION_1_VECTOR;
+	if (vp->writes_psize && !binning)
+		mode = POSITION_2_VECTORS_SPRITE;
+
+	/* set register to use for param (fragcoord/pointcoord/frontfacing) */
+	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
+	OUT_RING(ring, CP_REG(REG_A2XX_SQ_CONTEXT_MISC));
+	OUT_RING(ring, A2XX_SQ_CONTEXT_MISC_SC_SAMPLE_CNTL(CENTERS_ONLY) |
+		A2XX_SQ_CONTEXT_MISC_PARAM_GEN_POS(fp->f.inputs_count) |
+		/* we need SCREEN_XY for both fragcoord and frontfacing */
+		A2XX_SQ_CONTEXT_MISC_SC_OUTPUT_SCREEN_XY);
 
 	OUT_PKT3(ring, CP_SET_CONSTANT, 2);
 	OUT_RING(ring, CP_REG(REG_A2XX_SQ_PROGRAM_CNTL));
-	OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(POSITION_2_VECTORS_SPRITE) |
+	OUT_RING(ring, A2XX_SQ_PROGRAM_CNTL_PS_EXPORT_MODE(2) |
+			A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_MODE(mode) |
 			A2XX_SQ_PROGRAM_CNTL_VS_RESOURCE |
 			A2XX_SQ_PROGRAM_CNTL_PS_RESOURCE |
 			A2XX_SQ_PROGRAM_CNTL_VS_EXPORT_COUNT(vs_export) |
 			A2XX_SQ_PROGRAM_CNTL_PS_REGS(fs_gprs) |
-			A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs));
-}
-
-/* Creates shader:
- *    EXEC ADDR(0x2) CNT(0x1)
- *       (S)FETCH:	SAMPLE	R0.xyzw = R0.xyx CONST(0) LOCATION(CENTER)
- *    ALLOC PARAM/PIXEL SIZE(0x0)
- *    EXEC_END ADDR(0x3) CNT(0x1)
- *          ALU:	MAXv	export0 = R0, R0	; gl_FragColor
- *    NOP
- */
-static struct fd2_shader_stateobj *
-create_blit_fp(void)
-{
-	struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
-	struct ir2_instruction *instr;
-
-	if (!so)
-		return NULL;
-
-	so->ir = ir2_shader_create();
-
-	instr = ir2_instr_create_tex_fetch(so->ir, 0);
-	ir2_dst_create(instr, 0, "xyzw", 0);
-	ir2_reg_create(instr, 0, "xyx", IR2_REG_INPUT);
-	instr->sync = true;
-
-	instr = ir2_instr_create_alu_v(so->ir, MAXv);
-	ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
-	ir2_reg_create(instr, 0, NULL, 0);
-	ir2_reg_create(instr, 0, NULL, 0);
-
-	return assemble(so);
-}
-
-/* Creates shader:
-*     EXEC ADDR(0x3) CNT(0x2)
-*           FETCH:	VERTEX	R1.xy01 = R0.x FMT_32_32_FLOAT UNSIGNED STRIDE(8) CONST(26, 1)
-*           FETCH:	VERTEX	R2.xyz1 = R0.x FMT_32_32_32_FLOAT UNSIGNED STRIDE(12) CONST(26, 0)
-*     ALLOC POSITION SIZE(0x0)
-*     EXEC ADDR(0x5) CNT(0x1)
-*           ALU:	MAXv	export62 = R2, R2	; gl_Position
-*     ALLOC PARAM/PIXEL SIZE(0x0)
-*     EXEC_END ADDR(0x6) CNT(0x1)
-*           ALU:	MAXv	export0 = R1, R1
-*     NOP
- */
-static struct fd2_shader_stateobj *
-create_blit_vp(void)
-{
-	struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
-	struct ir2_instruction *instr;
-
-	if (!so)
-		return NULL;
-
-	so->ir = ir2_shader_create();
-
-	instr = ir2_instr_create_vtx_fetch(so->ir, 26, 1, FMT_32_32_FLOAT, false, 8);
-	instr->fetch.is_normalized = true;
-	ir2_dst_create(instr, 1, "xy01", 0);
-	ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
-	instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
-	instr->fetch.is_normalized = true;
-	ir2_dst_create(instr, 2, "xyz1", 0);
-	ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
-	instr = ir2_instr_create_alu_v(so->ir, MAXv);
-	ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
-	ir2_reg_create(instr, 2, NULL, 0);
-	ir2_reg_create(instr, 2, NULL, 0);
-
-	instr = ir2_instr_create_alu_v(so->ir, MAXv);
-	ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
-	ir2_reg_create(instr, 1, NULL, 0);
-	ir2_reg_create(instr, 1, NULL, 0);
-
-	return assemble(so);
-}
-
-/* Creates shader:
- *    ALLOC PARAM/PIXEL SIZE(0x0)
- *    EXEC_END ADDR(0x1) CNT(0x1)
- *          ALU:	MAXv	export0 = C0, C0	; gl_FragColor
- */
-static struct fd2_shader_stateobj *
-create_solid_fp(void)
-{
-	struct fd2_shader_stateobj *so = create_shader(SHADER_FRAGMENT);
-	struct ir2_instruction *instr;
-
-	if (!so)
-		return NULL;
-
-	so->ir = ir2_shader_create();
-
-	instr = ir2_instr_create_alu_v(so->ir, MAXv);
-	ir2_dst_create(instr, 0, NULL, IR2_REG_EXPORT);
-	ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
-	ir2_reg_create(instr, 0, NULL, IR2_REG_CONST);
-
-	return assemble(so);
-}
-
-/* Creates shader:
- *    EXEC ADDR(0x3) CNT(0x1)
- *       (S)FETCH:	VERTEX	R1.xyz1 = R0.x FMT_32_32_32_FLOAT
- *                           UNSIGNED STRIDE(12) CONST(26, 0)
- *    ALLOC POSITION SIZE(0x0)
- *    EXEC ADDR(0x4) CNT(0x1)
- *          ALU:	MAXv	export62 = R1, R1	; gl_Position
- *    ALLOC PARAM/PIXEL SIZE(0x0)
- *    EXEC_END ADDR(0x5) CNT(0x0)
- */
-static struct fd2_shader_stateobj *
-create_solid_vp(void)
-{
-	struct fd2_shader_stateobj *so = create_shader(SHADER_VERTEX);
-	struct ir2_instruction *instr;
-
-	if (!so)
-		return NULL;
-
-	so->ir = ir2_shader_create();
-
-	instr = ir2_instr_create_vtx_fetch(so->ir, 26, 0, FMT_32_32_32_FLOAT, false, 12);
-	ir2_dst_create(instr, 1, "xyz1", 0);
-	ir2_reg_create(instr, 0, "x", IR2_REG_INPUT);
-
-	instr = ir2_instr_create_alu_v(so->ir, MAXv);
-	ir2_dst_create(instr, 62, NULL, IR2_REG_EXPORT);
-	ir2_reg_create(instr, 1, NULL, 0);
-	ir2_reg_create(instr, 1, NULL, 0);
-
-
-	return assemble(so);
+			A2XX_SQ_PROGRAM_CNTL_VS_REGS(vs_gprs) |
+			COND(fp->need_param, A2XX_SQ_PROGRAM_CNTL_PARAM_GEN) |
+			COND(binning, A2XX_SQ_PROGRAM_CNTL_GEN_INDEX_VTX));
 }
 
 void
 fd2_prog_init(struct pipe_context *pctx)
 {
 	struct fd_context *ctx = fd_context(pctx);
+	struct fd_program_stateobj *prog;
+	struct fd2_shader_stateobj *so;
+	instr_fetch_vtx_t *instr;
 
 	pctx->create_fs_state = fd2_fp_state_create;
 	pctx->delete_fs_state = fd2_fp_state_delete;
@@ -442,8 +305,45 @@ fd2_prog_init(struct pipe_context *pctx)
 
 	fd_prog_init(pctx);
 
-	ctx->solid_prog.fp = create_solid_fp();
-	ctx->solid_prog.vp = create_solid_vp();
-	ctx->blit_prog[0].fp = create_blit_fp();
-	ctx->blit_prog[0].vp = create_blit_vp();
+	/* XXX maybe it's possible to reuse patch_vtx_fetch somehow? */
+
+	prog = &ctx->solid_prog;
+	so = prog->vp;
+	so->f = ((struct fd2_shader_stateobj*) prog->fp)->f;
+	ir2_compile(prog->vp, 0);
+
+#define IR2_FETCH_SWIZ_XY01 0xb08
+#define IR2_FETCH_SWIZ_XYZ1 0xa88
+
+	instr = (instr_fetch_vtx_t*) &so->info[0].dwords[so->info[0].fetch_info[0].offset];
+	instr->const_index = 26;
+	instr->const_index_sel = 0;
+	instr->format = FMT_32_32_32_FLOAT;
+	instr->format_comp_all = false;
+	instr->stride = 12;
+	instr->num_format_all = true;
+	instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1;
+
+	prog = &ctx->blit_prog[0];
+	so = prog->vp;
+	so->f = ((struct fd2_shader_stateobj*) prog->fp)->f;
+	ir2_compile(prog->vp, 0);
+
+	instr = (instr_fetch_vtx_t*) &so->info[0].dwords[so->info[0].fetch_info[0].offset];
+	instr->const_index = 26;
+	instr->const_index_sel = 1;
+	instr->format = FMT_32_32_FLOAT;
+	instr->format_comp_all = false;
+	instr->stride = 8;
+	instr->num_format_all = false;
+	instr->dst_swiz = IR2_FETCH_SWIZ_XY01;
+
+	instr = (instr_fetch_vtx_t*) &so->info[0].dwords[so->info[0].fetch_info[1].offset];
+	instr->const_index = 26;
+	instr->const_index_sel = 0;
+	instr->format = FMT_32_32_32_FLOAT;
+	instr->format_comp_all = false;
+	instr->stride = 12;
+	instr->num_format_all = false;
+	instr->dst_swiz = IR2_FETCH_SWIZ_XYZ1;
 }
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_program.h b/src/gallium/drivers/freedreno/a2xx/fd2_program.h
index d2df829e07..c23c7a2c24 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_program.h
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_program.h
@@ -31,46 +31,39 @@
 
 #include "freedreno_context.h"
 
-#include "ir-a2xx.h"
+#include "ir2.h"
 #include "disasm.h"
 
 struct fd2_shader_stateobj {
+	nir_shader *nir;
 	enum shader_t type;
 
-	uint32_t *bin;
-
-	struct tgsi_token *tokens;
-
-	/* note that we defer compiling shader until we know both vs and ps..
-	 * and if one changes, we potentially need to recompile in order to
-	 * get varying linkages correct:
-	 */
-	struct ir2_shader_info info;
-	struct ir2_shader *ir;
-
-	/* for vertex shaders, the fetch instructions which need to be
-	 * patched up before assembly:
-	 */
-	unsigned num_vfetch_instrs;
-	struct ir2_instruction *vfetch_instrs[64];
-
-	/* for all shaders, any tex fetch instructions which need to be
-	 * patched before assembly:
-	 */
-	unsigned num_tfetch_instrs;
-	struct {
-		unsigned samp_id;
-		struct ir2_instruction *instr;
-	} tfetch_instrs[64];
+	struct ir2_shader_info info[2];
 
 	unsigned first_immediate;     /* const reg # of first immediate */
 	unsigned num_immediates;
 	struct {
 		uint32_t val[4];
+		unsigned ncomp;
 	} immediates[64];
+
+	bool writes_psize;
+	bool need_param;
+
+	/* fragment shader info (vertex shader has copy) */
+	struct {
+		unsigned inputs_count;
+		struct {
+			uint8_t slot;
+			uint8_t ncomp;
+		} inputs[16];
+
+		/* driver_location of fragcoord.zw, -1 if not used */
+		int fragcoord;
+	} f;
 };
 
-void fd2_program_emit(struct fd_ringbuffer *ring,
+void fd2_program_emit(struct fd_batch *batch, struct fd_ringbuffer *ring,
 		struct fd_program_stateobj *prog);
 void fd2_program_validate(struct fd_context *ctx);
 
diff --git a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
index 5a9f93ec79..2591062ee3 100644
--- a/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
+++ b/src/gallium/drivers/freedreno/a2xx/instr-a2xx.h
@@ -87,6 +87,7 @@ typedef enum {
 	SIN = 48,
 	COS = 49,
 	RETAIN_PREV = 50,
+	SCALAR_NONE = 63,
 } instr_scalar_opc_t;
 
 typedef enum {
@@ -120,6 +121,7 @@ typedef enum {
 	KILLNEv = 27,
 	DSTv = 28,
 	MOVAv = 29,
+	VECTOR_NONE = 31,
 } instr_vector_opc_t;
 
 typedef struct PACKED {
@@ -161,9 +163,9 @@ typedef struct PACKED {
 		};
 		/* constants have full 8-bit index */
 		struct {
-			uint8_t             src3_reg_const   : 8;
-			uint8_t             src2_reg_const   : 8;
-			uint8_t             src1_reg_const   : 8;
+			uint8_t             src3_reg_byte    : 8;
+			uint8_t             src2_reg_byte    : 8;
+			uint8_t             src1_reg_byte    : 8;
 		};
 	};
 	instr_vector_opc_t  vector_opc               : 5;
@@ -389,10 +391,17 @@ typedef union PACKED {
 		instr_fetch_opc_t opc                    : 5;
 		uint32_t        dummy0                   : 27;
 		/* dword1: */
-		uint32_t        dummy1                   : 32;
+		uint32_t        dummy1                   : 31;
+		uint8_t         pred_select              : 1;
 		/* dword2: */
-		uint32_t        dummy2                   : 32;
+		uint32_t        dummy2                   : 31;
+		uint8_t         pred_condition           : 1;
 	};
 } instr_fetch_t;
 
+typedef union PACKED {
+	instr_alu_t alu;
+	instr_fetch_t fetch;
+} instr_t;
+
 #endif /* INSTR_H_ */
diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
deleted file mode 100644
index af9811864f..0000000000
--- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/*
- * Copyright (c) 2012 Rob Clark <robdclark at gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "ir-a2xx.h"
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include "freedreno_util.h"
-#include "instr-a2xx.h"
-
-#define DEBUG_MSG(f, ...)  do { if (0) DBG(f, ##__VA_ARGS__); } while (0)
-#define WARN_MSG(f, ...)   DBG("WARN:  "f, ##__VA_ARGS__)
-#define ERROR_MSG(f, ...)  DBG("ERROR: "f, ##__VA_ARGS__)
-
-static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
-		uint32_t idx, struct ir2_shader_info *info);
-
-static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n);
-static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg);
-static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg);
-static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg);
-
-/* simple allocator to carve allocations out of an up-front allocated heap,
- * so that we can free everything easily in one shot.
- */
-static void * ir2_alloc(struct ir2_shader *shader, int sz)
-{
-	void *ptr = &shader->heap[shader->heap_idx];
-	shader->heap_idx += align(sz, 4) / 4;
-	return ptr;
-}
-
-static char * ir2_strdup(struct ir2_shader *shader, const char *str)
-{
-	char *ptr = NULL;
-	if (str) {
-		int len = strlen(str);
-		ptr = ir2_alloc(shader, len+1);
-		memcpy(ptr, str, len);
-		ptr[len] = '\0';
-	}
-	return ptr;
-}
-
-struct ir2_shader * ir2_shader_create(void)
-{
-	DEBUG_MSG("");
-	struct ir2_shader *shader = calloc(1, sizeof(struct ir2_shader));
-	shader->max_reg = -1;
-	return shader;
-}
-
-void ir2_shader_destroy(struct ir2_shader *shader)
-{
-	DEBUG_MSG("");
-	free(shader);
-}
-
-/* check if an instruction is a simple MOV
- */
-static struct ir2_instruction * simple_mov(struct ir2_instruction *instr,
-		bool output)
-{
-    struct ir2_src_register *src_reg = instr->src_reg;
-    struct ir2_dst_register *dst_reg = &instr->dst_reg;
-    struct ir2_register *reg;
-    unsigned i;
-
-    /* MAXv used for MOV */
-    if (instr->instr_type != IR2_ALU_VECTOR ||
-		instr->alu_vector.opc != MAXv)
-		return NULL;
-
-	/* non identical srcs */
-	if (src_reg[0].num != src_reg[1].num)
-		return NULL;
-
-	/* flags */
-	int flags = IR2_REG_NEGATE | IR2_REG_ABS;
-	if (output)
-		flags |= IR2_REG_INPUT | IR2_REG_CONST;
-	if ((src_reg[0].flags & flags) || (src_reg[1].flags & flags))
-		return NULL;
-
-	/* clamping */
-	if (instr->alu_vector.clamp)
-		return NULL;
-
-	/* swizzling */
-    for (i = 0; i < 4; i++) {
-		char swiz = (dst_reg->swizzle ? dst_reg->swizzle : "xyzw")[i];
-		if (swiz == '_')
-			continue;
-
-		if (swiz != (src_reg[0].swizzle ? src_reg[0].swizzle : "xyzw")[i] ||
-			swiz != (src_reg[1].swizzle ? src_reg[1].swizzle : "xyzw")[i])
-			return NULL;
-    }
-
-    if (output)
-		reg = &instr->shader->reg[src_reg[0].num];
-	else
-		reg = &instr->shader->reg[dst_reg->num];
-
-	assert(reg->write_idx >= 0);
-    if (reg->write_idx != reg->write_idx2)
-		return NULL;
-
-	if (!output)
-		return instr;
-
-	instr = instr->shader->instr[reg->write_idx];
-	return instr->instr_type != IR2_ALU_VECTOR ? NULL : instr;
-}
-
-static int src_to_reg(struct ir2_instruction *instr,
-		struct ir2_src_register *reg)
-{
-	if (reg->flags & IR2_REG_CONST)
-		return reg->num;
-
-	return instr->shader->reg[reg->num].reg;
-}
-
-static int dst_to_reg(struct ir2_instruction *instr,
-		struct ir2_dst_register *reg)
-{
-	if (reg->flags & IR2_REG_EXPORT)
-		return reg->num;
-
-	return instr->shader->reg[reg->num].reg;
-}
-
-static bool mask_get(uint32_t *mask, unsigned index)
-{
-    return !!(mask[index / 32] & 1 << index % 32);
-}
-
-static void mask_set(uint32_t *mask, struct ir2_register *reg, int index)
-{
-	if (reg) {
-		unsigned i;
-		for (i = 0; i < ARRAY_SIZE(reg->regmask); i++)
-			mask[i] |= reg->regmask[i];
-	}
-	if (index >= 0)
-		mask[index / 32] |= 1 << index % 32;
-}
-
-static bool sets_pred(struct ir2_instruction *instr)
-{
-    return instr->instr_type == IR2_ALU_SCALAR &&
-		instr->alu_scalar.opc >= PRED_SETEs &&
-		instr->alu_scalar.opc <= PRED_SET_RESTOREs;
-}
-
-
-
-void* ir2_shader_assemble(struct ir2_shader *shader,
-		struct ir2_shader_info *info)
-{
-	/* NOTES
-	 * blob compiler seems to always puts PRED_* instrs in a CF by
-	 * themselves, and wont combine EQ/NE in the same CF
-	 * (not doing this - doesn't seem to make a difference)
-	 *
-	 * TODO: implement scheduling for combining vector+scalar instructions
-	 * -some vector instructions can be replaced by scalar
-	 */
-
-	/* first step:
-	 * 1. remove "NOP" MOV instructions generated by TGSI for input/output:
-	 * 2. track information for register allocation, and to remove
-	 * the dead code when some exports are not needed
-	 * 3. add additional instructions for a20x hw binning if needed
-	 * NOTE: modifies the shader instrs
-	 * this step could be done as instructions are added by compiler instead
-	 */
-
-	/* mask of exports that must be generated
-	 * used to avoid calculating ps exports with hw binning
-	*/
-	uint64_t export = ~0ull;
-	/* bitmask of variables required for exports defined by "export" */
-	uint32_t export_mask[REG_MASK/32+1] = {};
-
-	unsigned idx, reg_idx;
-	unsigned max_input = 0;
-	int export_size = -1;
-
-	for (idx = 0; idx < shader->instr_count; idx++) {
-		struct ir2_instruction *instr = shader->instr[idx], *prev;
-		struct ir2_dst_register dst_reg = instr->dst_reg;
-
-		if (dst_reg.flags & IR2_REG_EXPORT) {
-			if (dst_reg.num < 32)
-				export_size++;
-
-			if ((prev = simple_mov(instr, true))) {
-				/* copy instruction but keep dst */
-				*instr = *prev;
-				instr->dst_reg = dst_reg;
-			}
-		}
-
-		for (reg_idx = 0; reg_idx < instr->src_reg_count; reg_idx++) {
-			struct ir2_src_register *src_reg = &instr->src_reg[reg_idx];
-			struct ir2_register *reg;
-			int num;
-
-			if (src_reg->flags & IR2_REG_CONST)
-				continue;
-
-			num = src_reg->num;
-			reg = &shader->reg[num];
-			reg->read_idx = idx;
-
-			if (src_reg->flags & IR2_REG_INPUT) {
-				max_input = MAX2(max_input, num);
-			} else {
-				/* bypass simple mov used to set src_reg */
-				assert(reg->write_idx >= 0);
-				prev = shader->instr[reg->write_idx];
-				if (simple_mov(prev, false)) {
-					*src_reg = prev->src_reg[0];
-					/* process same src_reg again */
-					reg_idx -= 1;
-					continue;
-				}
-			}
-
-			/* update dependencies */
-			uint32_t *mask = (dst_reg.flags & IR2_REG_EXPORT) ?
-					export_mask : shader->reg[dst_reg.num].regmask;
-			mask_set(mask, reg, num);
-			if (sets_pred(instr))
-				mask_set(export_mask, reg, num);
-		}
-	}
-
-	/* second step:
-	 * emit instructions (with CFs) + RA
-	 */
-	instr_cf_t cfs[128], *cf = cfs;
-	uint32_t alufetch[3*256], *af = alufetch;
-
-	/* RA is done on write, so inputs must be allocated here */
-	for (reg_idx = 0; reg_idx <= max_input; reg_idx++)
-		shader->reg[reg_idx].reg = reg_idx;
-	info->max_reg = max_input;
-
-	/* CF instr state */
-	instr_cf_exec_t exec = { .opc = EXEC };
-	instr_cf_alloc_t alloc = { .opc = ALLOC };
-	bool need_alloc = 0;
-	bool pos_export = 0;
-
-	export_size = MAX2(export_size, 0);
-
-	for (idx = 0; idx < shader->instr_count; idx++) {
-		struct ir2_instruction *instr = shader->instr[idx];
-		struct ir2_dst_register *dst_reg = &instr->dst_reg;
-		unsigned num = dst_reg->num;
-		struct ir2_register *reg;
-
-		/* a2xx only has 64 registers, so we can use a single 64-bit mask */
-		uint64_t regmask = 0ull;
-
-		/* compute the current regmask */
-		for (reg_idx = 0; (int) reg_idx <= shader->max_reg; reg_idx++) {
-			reg = &shader->reg[reg_idx];
-			if ((int) idx > reg->write_idx && idx < reg->read_idx)
-				regmask |= (1ull << reg->reg);
-		}
-
-		if (dst_reg->flags & IR2_REG_EXPORT) {
-			/* skip if export is not needed */
-			if (!(export & (1ull << num)))
-				continue;
-
-            /* ALLOC CF:
-             * want to alloc all < 32 at once
-			 * 32/33 and 62/63 come in pairs
-			 * XXX assuming all 3 types are never interleaved
-			 */
-            if (num < 32) {
-				alloc.size = export_size;
-				alloc.buffer_select = SQ_PARAMETER_PIXEL;
-				need_alloc = export_size >= 0;
-				export_size = -1;
-			} else if (num == 32 || num == 33) {
-				alloc.size = 0;
-				alloc.buffer_select = SQ_MEMORY;
-				need_alloc = num != 33;
-			} else {
-				alloc.size = 0;
-				alloc.buffer_select = SQ_POSITION;
-				need_alloc = !pos_export;
-				pos_export = true;
-			}
-
-		} else {
-			/* skip if dst register not needed to compute exports */
-			if (!mask_get(export_mask, num))
-				continue;
-
-			/* RA on first write */
-			reg = &shader->reg[num];
-			if (reg->write_idx == idx) {
-				reg->reg = ffsll(~regmask) - 1;
-				info->max_reg = MAX2(info->max_reg, reg->reg);
-			}
-		}
-
-		if (exec.count == 6 || (exec.count && need_alloc)) {
-			*cf++ = *(instr_cf_t*) &exec;
-			exec.address += exec.count;
-			exec.serialize = 0;
-			exec.count = 0;
-		}
-
-		if (need_alloc) {
-			*cf++ = *(instr_cf_t*) &alloc;
-			need_alloc = false;
-		}
-
-		int ret = instr_emit(instr, af, idx, info); af += 3;
-		assert(!ret);
-
-		if (instr->instr_type == IR2_FETCH)
-			exec.serialize |= 0x1 << exec.count * 2;
-		if (instr->sync)
-			exec.serialize |= 0x2 << exec.count * 2;
-		 exec.count += 1;
-	}
-
-
-	exec.opc = !export_size ? EXEC : EXEC_END;
-	*cf++ = *(instr_cf_t*) &exec;
-	exec.address += exec.count;
-	exec.serialize = 0;
-	exec.count = 0;
-
-	/* GPU will hang without at least one pixel alloc */
-	if (!export_size) {
-		alloc.size = 0;
-		alloc.buffer_select = SQ_PARAMETER_PIXEL;
-		*cf++ = *(instr_cf_t*) &alloc;
-
-		exec.opc = EXEC_END;
-		*cf++ = *(instr_cf_t*) &exec;
-	}
-
-	unsigned num_cfs = cf - cfs;
-
-	/* insert nop to get an even # of CFs */
-	if (num_cfs % 2) {
-		*cf++ = (instr_cf_t) { .opc = NOP };
-		num_cfs++;
-	}
-
-	/* offset cf addrs */
-	for (idx = 0; idx < num_cfs; idx++) {
-        switch (cfs[idx].opc) {
-		case EXEC:
-		case EXEC_END:
-			cfs[idx].exec.address += num_cfs / 2;
-			break;
-		default:
-			break;
-		/* XXX  and any other address using cf that gets implemented */
-		}
-	}
-
-	/* concatenate cfs+alufetchs */
-	uint32_t cfdwords = num_cfs / 2 * 3;
-	uint32_t alufetchdwords = exec.address * 3;
-	info->sizedwords = cfdwords + alufetchdwords;
-	uint32_t *dwords = malloc(info->sizedwords * 4);
-	assert(dwords);
-	memcpy(dwords, cfs, cfdwords * 4);
-	memcpy(&dwords[cfdwords], alufetch, alufetchdwords * 4);
-	return dwords;
-}
-
-struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
-		int instr_type)
-{
-	struct ir2_instruction *instr =
-			ir2_alloc(shader, sizeof(struct ir2_instruction));
-	DEBUG_MSG("%d", instr_type);
-	instr->shader = shader;
-	instr->idx = shader->instr_count;
-	instr->pred = shader->pred;
-	instr->instr_type = instr_type;
-	shader->instr[shader->instr_count++] = instr;
-	return instr;
-}
-
-
-/*
- * FETCH instructions:
- */
-
-static int instr_emit_fetch(struct ir2_instruction *instr,
-		uint32_t *dwords, uint32_t idx,
-		struct ir2_shader_info *info)
-{
-	instr_fetch_t *fetch = (instr_fetch_t *)dwords;
-	struct ir2_dst_register *dst_reg = &instr->dst_reg;
-	struct ir2_src_register *src_reg = &instr->src_reg[0];
-
-	memset(fetch, 0, sizeof(*fetch));
-
-	fetch->opc = instr->fetch.opc;
-
-	if (instr->fetch.opc == VTX_FETCH) {
-		instr_fetch_vtx_t *vtx = &fetch->vtx;
-
-		assert(instr->fetch.stride <= 0xff);
-		assert(instr->fetch.fmt <= 0x3f);
-		assert(instr->fetch.const_idx <= 0x1f);
-		assert(instr->fetch.const_idx_sel <= 0x3);
-
-		vtx->src_reg = src_to_reg(instr, src_reg);
-		vtx->src_swiz = reg_fetch_src_swiz(src_reg, 1);
-		vtx->dst_reg = dst_to_reg(instr, dst_reg);
-		vtx->dst_swiz = reg_fetch_dst_swiz(dst_reg);
-		vtx->must_be_one = 1;
-		vtx->const_index = instr->fetch.const_idx;
-		vtx->const_index_sel = instr->fetch.const_idx_sel;
-		vtx->format_comp_all = !!instr->fetch.is_signed;
-		vtx->num_format_all = !instr->fetch.is_normalized;
-		vtx->format = instr->fetch.fmt;
-		vtx->stride = instr->fetch.stride;
-		vtx->offset = instr->fetch.offset;
-
-		if (instr->pred != IR2_PRED_NONE) {
-			vtx->pred_select = 1;
-			vtx->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0;
-		}
-
-		/* XXX seems like every FETCH but the first has
-		 * this bit set:
-		 */
-		vtx->reserved3 = (idx > 0) ? 0x1 : 0x0;
-		vtx->reserved0 = (idx > 0) ? 0x2 : 0x3;
-	} else if (instr->fetch.opc == TEX_FETCH) {
-		instr_fetch_tex_t *tex = &fetch->tex;
-
-		assert(instr->fetch.const_idx <= 0x1f);
-
-		tex->src_reg = src_to_reg(instr, src_reg);
-		tex->src_swiz = reg_fetch_src_swiz(src_reg, 3);
-		tex->dst_reg = dst_to_reg(instr, dst_reg);
-		tex->dst_swiz = reg_fetch_dst_swiz(dst_reg);
-		tex->const_idx = instr->fetch.const_idx;
-		tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
-		tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
-		tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
-		tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
-		tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
-		tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
-		tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
-		tex->use_comp_lod = 1;
-		tex->use_reg_lod = !instr->fetch.is_cube;
-		tex->sample_location = SAMPLE_CENTER;
-		tex->tx_coord_denorm = instr->fetch.is_rect;
-
-		if (instr->pred != IR2_PRED_NONE) {
-			tex->pred_select = 1;
-			tex->pred_condition = (instr->pred == IR2_PRED_EQ) ? 1 : 0;
-		}
-
-	} else {
-		ERROR_MSG("invalid fetch opc: %d\n", instr->fetch.opc);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * ALU instructions:
- */
-
-static int instr_emit_alu(struct ir2_instruction *instr_v,
-		struct ir2_instruction *instr_s, uint32_t *dwords,
-		struct ir2_shader_info *info)
-{
-	instr_alu_t *alu = (instr_alu_t *)dwords;
-	struct ir2_dst_register *vdst_reg, *sdst_reg;
-	struct ir2_src_register *src1_reg, *src2_reg, *src3_reg;
-	struct ir2_shader *shader = instr_v ? instr_v->shader : instr_s->shader;
-	enum ir2_pred pred = IR2_PRED_NONE;
-
-	memset(alu, 0, sizeof(*alu));
-
-	vdst_reg = NULL;
-	sdst_reg = NULL;
-	src1_reg = NULL;
-	src2_reg = NULL;
-	src3_reg = NULL;
-
-	if (instr_v) {
-		vdst_reg = &instr_v->dst_reg;
-		assert(instr_v->src_reg_count >= 2);
-		src1_reg = &instr_v->src_reg[0];
-		src2_reg = &instr_v->src_reg[1];
-		if (instr_v->src_reg_count > 2)
-			src3_reg = &instr_v->src_reg[2];
-		pred = instr_v->pred;
-	}
-
-	if (instr_s) {
-		sdst_reg = &instr_s->dst_reg;
-		assert(instr_s->src_reg_count == 1);
-		assert(!instr_v || vdst_reg->flags == sdst_reg->flags);
-		assert(!instr_v || pred == instr_s->pred);
-		if (src3_reg) {
-			assert(src3_reg->flags == instr_s->src_reg[0].flags);
-			assert(src3_reg->num == instr_s->src_reg[0].num);
-			assert(!strcmp(src3_reg->swizzle, instr_s->src_reg[0].swizzle));
-		}
-		src3_reg = &instr_s->src_reg[0];
-		pred = instr_s->pred;
-	}
-
-	if (vdst_reg) {
-		assert((vdst_reg->flags & ~IR2_REG_EXPORT) == 0);
-		assert(!vdst_reg->swizzle || (strlen(vdst_reg->swizzle) == 4));
-		alu->vector_opc          = instr_v->alu_vector.opc;
-		alu->vector_write_mask   = reg_alu_dst_swiz(vdst_reg);
-		alu->vector_dest         = dst_to_reg(instr_v, vdst_reg);
-	} else {
-		alu->vector_opc          = MAXv;
-	}
-
-	if (sdst_reg) {
-		alu->scalar_opc          = instr_s->alu_scalar.opc;
-		alu->scalar_write_mask   = reg_alu_dst_swiz(sdst_reg);
-		alu->scalar_dest         = dst_to_reg(instr_s, sdst_reg);
-	} else {
-		/* not sure if this is required, but adreno compiler seems
-		 * to always set scalar opc to MAXs if it is not used:
-		 */
-		alu->scalar_opc = MAXs;
-	}
-
-	alu->export_data =
-		!!((instr_v ? vdst_reg : sdst_reg)->flags & IR2_REG_EXPORT);
-
-	/* export32 has this bit set.. it seems to do more than just set
-	 * the base address of the constants used to zero
-	 * TODO make this less of a hack
-	 */
-	if (alu->export_data && alu->vector_dest == 32) {
-		assert(!instr_s);
-		alu->relative_addr = 1;
-	}
-
-	if (src1_reg) {
-		if (src1_reg->flags & IR2_REG_CONST) {
-			assert(!(src1_reg->flags & IR2_REG_ABS));
-			alu->src1_reg_const  = src1_reg->num;
-		} else {
-			alu->src1_reg        = shader->reg[src1_reg->num].reg;
-			alu->src1_reg_abs    = !!(src1_reg->flags & IR2_REG_ABS);
-		}
-		alu->src1_swiz           = reg_alu_src_swiz(src1_reg);
-		alu->src1_reg_negate     = !!(src1_reg->flags & IR2_REG_NEGATE);
-		alu->src1_sel            = !(src1_reg->flags & IR2_REG_CONST);
-    }  else {
-		alu->src1_sel = 1;
-	}
-
-    if (src2_reg) {
-		if (src2_reg->flags & IR2_REG_CONST) {
-			assert(!(src2_reg->flags & IR2_REG_ABS));
-			alu->src2_reg_const  = src2_reg->num;
-		} else {
-			alu->src2_reg        = shader->reg[src2_reg->num].reg;
-			alu->src2_reg_abs    = !!(src2_reg->flags & IR2_REG_ABS);
-		}
-		alu->src2_swiz           = reg_alu_src_swiz(src2_reg);
-		alu->src2_reg_negate     = !!(src2_reg->flags & IR2_REG_NEGATE);
-		alu->src2_sel            = !(src2_reg->flags & IR2_REG_CONST);
-    } else {
-		alu->src2_sel = 1;
-    }
-
-    if (src3_reg) {
-		if (src3_reg->flags & IR2_REG_CONST) {
-			assert(!(src3_reg->flags & IR2_REG_ABS));
-			alu->src3_reg_const  = src3_reg->num;
-		} else {
-			alu->src3_reg        = shader->reg[src3_reg->num].reg;
-			alu->src3_reg_abs    = !!(src3_reg->flags & IR2_REG_ABS);
-		}
-		alu->src3_swiz           = reg_alu_src_swiz(src3_reg);
-		alu->src3_reg_negate     = !!(src3_reg->flags & IR2_REG_NEGATE);
-		alu->src3_sel            = !(src3_reg->flags & IR2_REG_CONST);
-	} else {
-		/* not sure if this is required, but adreno compiler seems
-		 * to always set register bank for 3rd src if unused:
-		 */
-		alu->src3_sel = 1;
-	}
-
-	alu->vector_clamp = instr_v ? instr_v->alu_vector.clamp : 0;
-	alu->scalar_clamp = instr_s ? instr_s->alu_scalar.clamp : 0;
-
-	if (pred != IR2_PRED_NONE)
-		alu->pred_select = (pred == IR2_PRED_EQ) ? 3 : 2;
-
-	return 0;
-}
-
-static int instr_emit(struct ir2_instruction *instr, uint32_t *dwords,
-		uint32_t idx, struct ir2_shader_info *info)
-{
-	switch (instr->instr_type) {
-	case IR2_FETCH: return instr_emit_fetch(instr, dwords, idx, info);
-	case IR2_ALU_VECTOR: return instr_emit_alu(instr, NULL, dwords, info);
-	case IR2_ALU_SCALAR: return instr_emit_alu(NULL, instr, dwords, info);
-	}
-	return -1;
-}
-
-struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
-		int num, const char *swizzle, int flags)
-{
-	if (!(flags & IR2_REG_EXPORT)) {
-		struct ir2_register *reg = &instr->shader->reg[num];
-
-		unsigned i;
-		for (i = instr->shader->max_reg + 1; i <= num; i++)
-			instr->shader->reg[i].write_idx = -1;
-		instr->shader->max_reg = i - 1;
-
-		if (reg->write_idx < 0)
-            reg->write_idx = instr->idx;
-		reg->write_idx2 = instr->idx;
-	}
-
-	struct ir2_dst_register *reg = &instr->dst_reg;
-	reg->flags = flags;
-	reg->num = num;
-	reg->swizzle = ir2_strdup(instr->shader, swizzle);
-	return reg;
-}
-
-struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
-		int num, const char *swizzle, int flags)
-{
-	assert(instr->src_reg_count + 1 <= ARRAY_SIZE(instr->src_reg));
-	if (!(flags & IR2_REG_CONST)) {
-		struct ir2_register *reg = &instr->shader->reg[num];
-
-		reg->read_idx = instr->idx;
-
-		unsigned i;
-		for (i = instr->shader->max_reg + 1; i <= num; i++)
-			instr->shader->reg[i].write_idx = -1;
-		instr->shader->max_reg = i - 1;
-	}
-
-	struct ir2_src_register *reg = &instr->src_reg[instr->src_reg_count++];
-	reg->flags = flags;
-	reg->num = num;
-	reg->swizzle = ir2_strdup(instr->shader, swizzle);
-	return reg;
-}
-
-static uint32_t reg_fetch_src_swiz(struct ir2_src_register *reg, uint32_t n)
-{
-	uint32_t swiz = 0;
-	int i;
-
-	assert((reg->flags & ~IR2_REG_INPUT) == 0);
-	assert(reg->swizzle);
-
-	DEBUG_MSG("fetch src R%d.%s", reg->num, reg->swizzle);
-
-	for (i = n-1; i >= 0; i--) {
-		swiz <<= 2;
-		switch (reg->swizzle[i]) {
-		default:
-			ERROR_MSG("invalid fetch src swizzle: %s", reg->swizzle);
-		case 'x': swiz |= 0x0; break;
-		case 'y': swiz |= 0x1; break;
-		case 'z': swiz |= 0x2; break;
-		case 'w': swiz |= 0x3; break;
-		}
-	}
-
-	return swiz;
-}
-
-static uint32_t reg_fetch_dst_swiz(struct ir2_dst_register *reg)
-{
-	uint32_t swiz = 0;
-	int i;
-
-	assert(reg->flags == 0);
-	assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
-	DEBUG_MSG("fetch dst R%d.%s", reg->num, reg->swizzle);
-
-	if (reg->swizzle) {
-		for (i = 3; i >= 0; i--) {
-			swiz <<= 3;
-			switch (reg->swizzle[i]) {
-			default:
-				ERROR_MSG("invalid dst swizzle: %s", reg->swizzle);
-			case 'x': swiz |= 0x0; break;
-			case 'y': swiz |= 0x1; break;
-			case 'z': swiz |= 0x2; break;
-			case 'w': swiz |= 0x3; break;
-			case '0': swiz |= 0x4; break;
-			case '1': swiz |= 0x5; break;
-			case '_': swiz |= 0x7; break;
-			}
-		}
-	} else {
-		swiz = 0x688;
-	}
-
-	return swiz;
-}
-
-/* actually, a write-mask */
-static uint32_t reg_alu_dst_swiz(struct ir2_dst_register *reg)
-{
-	uint32_t swiz = 0;
-	int i;
-
-	assert((reg->flags & ~IR2_REG_EXPORT) == 0);
-	assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
-	DEBUG_MSG("alu dst R%d.%s", reg->num, reg->swizzle);
-
-	if (reg->swizzle) {
-		for (i = 3; i >= 0; i--) {
-			swiz <<= 1;
-			if (reg->swizzle[i] == "xyzw"[i]) {
-				swiz |= 0x1;
-			} else if (reg->swizzle[i] != '_') {
-				ERROR_MSG("invalid dst swizzle: %s", reg->swizzle);
-				break;
-			}
-		}
-	} else {
-		swiz = 0xf;
-	}
-
-	return swiz;
-}
-
-static uint32_t reg_alu_src_swiz(struct ir2_src_register *reg)
-{
-	uint32_t swiz = 0;
-	int i;
-
-	assert(!reg->swizzle || (strlen(reg->swizzle) == 4));
-
-	DEBUG_MSG("vector src R%d.%s", reg->num, reg->swizzle);
-
-	if (reg->swizzle) {
-		for (i = 3; i >= 0; i--) {
-			swiz <<= 2;
-			switch (reg->swizzle[i]) {
-			default:
-				ERROR_MSG("invalid vector src swizzle: %s", reg->swizzle);
-			case 'x': swiz |= (0x0 - i) & 0x3; break;
-			case 'y': swiz |= (0x1 - i) & 0x3; break;
-			case 'z': swiz |= (0x2 - i) & 0x3; break;
-			case 'w': swiz |= (0x3 - i) & 0x3; break;
-			}
-		}
-	} else {
-		swiz = 0x0;
-	}
-
-	return swiz;
-}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h b/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
deleted file mode 100644
index ac2931266d..0000000000
--- a/src/gallium/drivers/freedreno/a2xx/ir-a2xx.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2012 Rob Clark <robdclark at gmail.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef IR2_H_
-#define IR2_H_
-
-#include <stdint.h>
-#include <stdbool.h>
-
-#include "instr-a2xx.h"
-
-/* low level intermediate representation of an adreno a2xx shader program */
-
-struct ir2_shader;
-
-#define REG_MASK 0xff
-
-struct ir2_shader_info {
-	uint16_t sizedwords;
-	int8_t   max_reg;   /* highest GPR # used by shader */
-};
-
-struct ir2_register {
-	int16_t write_idx, write_idx2, read_idx, reg;
-	/* bitmask of variables on which this one depends
-	 * XXX: use bitmask util?
-	 */
-	uint32_t regmask[REG_MASK/32+1];
-};
-
-struct ir2_src_register {
-	enum {
-		IR2_REG_INPUT  = 0x1,
-		IR2_REG_CONST  = 0x2,
-		IR2_REG_NEGATE = 0x4,
-		IR2_REG_ABS    = 0x8,
-	} flags;
-	int num;
-	char *swizzle;
-};
-
-struct ir2_dst_register {
-	enum {
-		IR2_REG_EXPORT = 0x1,
-	} flags;
-	int num;
-	char *swizzle;
-};
-
-enum ir2_pred {
-	IR2_PRED_NONE = 0,
-	IR2_PRED_EQ = 1,
-	IR2_PRED_NE = 2,
-};
-
-struct ir2_instruction {
-	struct ir2_shader *shader;
-	unsigned idx;
-	enum {
-		IR2_FETCH,
-		IR2_ALU_VECTOR,
-		IR2_ALU_SCALAR,
-	} instr_type;
-	enum ir2_pred pred;
-	int sync;
-	unsigned src_reg_count;
-	struct ir2_dst_register dst_reg;
-	struct ir2_src_register src_reg[3];
-	union {
-		/* FETCH specific: */
-		struct {
-			instr_fetch_opc_t opc;
-			unsigned const_idx;
-			/* texture fetch specific: */
-			bool is_cube : 1;
-			bool is_rect : 1;
-			/* vertex fetch specific: */
-			unsigned const_idx_sel;
-			enum a2xx_sq_surfaceformat fmt;
-			bool is_signed : 1;
-			bool is_normalized : 1;
-			uint32_t stride;
-			uint32_t offset;
-		} fetch;
-		/* ALU-Vector specific: */
-		struct {
-			instr_vector_opc_t opc;
-			bool clamp;
-		} alu_vector;
-		/* ALU-Scalar specific: */
-		struct {
-			instr_scalar_opc_t opc;
-			bool clamp;
-		} alu_scalar;
-	};
-};
-
-struct ir2_shader {
-	unsigned instr_count;
-	int max_reg;
-	struct ir2_register reg[REG_MASK+1];
-
-	struct ir2_instruction *instr[0x200];
-	uint32_t heap[100 * 4096];
-	unsigned heap_idx;
-
-	enum ir2_pred pred;  /* pred inherited by newly created instrs */
-};
-
-struct ir2_shader * ir2_shader_create(void);
-void ir2_shader_destroy(struct ir2_shader *shader);
-void * ir2_shader_assemble(struct ir2_shader *shader,
-		struct ir2_shader_info *info);
-
-struct ir2_instruction * ir2_instr_create(struct ir2_shader *shader,
-		int instr_type);
-
-struct ir2_dst_register * ir2_dst_create(struct ir2_instruction *instr,
-		int num, const char *swizzle, int flags);
-struct ir2_src_register * ir2_reg_create(struct ir2_instruction *instr,
-		int num, const char *swizzle, int flags);
-
-/* some helper fxns: */
-
-static inline struct ir2_instruction *
-ir2_instr_create_alu_v(struct ir2_shader *shader, instr_vector_opc_t vop)
-{
-	struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_VECTOR);
-	if (!instr)
-		return instr;
-	instr->alu_vector.opc = vop;
-	return instr;
-}
-
-static inline struct ir2_instruction *
-ir2_instr_create_alu_s(struct ir2_shader *shader, instr_scalar_opc_t sop)
-{
-	struct ir2_instruction *instr = ir2_instr_create(shader, IR2_ALU_SCALAR);
-	if (!instr)
-		return instr;
-	instr->alu_scalar.opc = sop;
-	return instr;
-}
-
-static inline struct ir2_instruction *
-ir2_instr_create_vtx_fetch(struct ir2_shader *shader, int ci, int cis,
-		enum a2xx_sq_surfaceformat fmt, bool is_signed, int stride)
-{
-	struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
-	instr->fetch.opc = VTX_FETCH;
-	instr->fetch.const_idx = ci;
-	instr->fetch.const_idx_sel = cis;
-	instr->fetch.fmt = fmt;
-	instr->fetch.is_signed = is_signed;
-	instr->fetch.stride = stride;
-	return instr;
-}
-static inline struct ir2_instruction *
-ir2_instr_create_tex_fetch(struct ir2_shader *shader, int ci)
-{
-	struct ir2_instruction *instr = ir2_instr_create(shader, IR2_FETCH);
-	instr->fetch.opc = TEX_FETCH;
-	instr->fetch.const_idx = ci;
-	return instr;
-}
-
-
-#endif /* IR2_H_ */
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.c b/src/gallium/drivers/freedreno/a2xx/ir2.c
new file mode 100644
index 0000000000..e0769ec013
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include "ir2_private.h"
+
+static bool scalar_possible(struct ir2_instr *instr)
+{
+	/* usable in the scalar slot only if a scalar opcode exists for it
+	 * and it reads exactly one source component
+	 */
+	return instr->alu.scalar_opc != SCALAR_NONE && src_ncomp(instr) == 1;
+}
+
+static bool is_alu_compatible(struct ir2_instr *a, struct ir2_instr *b)
+{
+	/* nothing selected for the other slot yet: anything can pair with it */
+	if (a == NULL)
+		return true;
+	/* an instruction cannot be co-issued with itself, and both slots must
+	 * target the same export since writing different exports from one
+	 * instruction seems to cause issues
+	 */
+	return a != b && a->alu.export == b->alu.export;
+}
+
+/* scheduling rank of an instruction in the vector slot (smaller = sooner) */
+static unsigned alu_vector_prio(struct ir2_instr *instr)
+{
+	unsigned prio;
+	if (instr->alu.vector_opc == VECTOR_NONE)
+		prio = ~0u;	/* has no vector form at all */
+	else if (is_export(instr))
+		prio = 4;	/* exports get the largest finite value */
+	else if (instr->src_count == 3)
+		prio = 0;	/* TODO check src type and ncomps */
+	else if (!scalar_possible(instr))
+		prio = 1;	/* cannot be moved to the scalar slot */
+	else if (instr->src_count == 2)
+		prio = 2;
+	else
+		prio = 3;
+	return prio;
+}
+
+/* scheduling rank of an instruction in the scalar slot (smaller = sooner) */
+static unsigned alu_scalar_prio(struct ir2_instr *instr)
+{
+	/* not scalar-capable, or multi-source (that case is dealt with
+	 * later by the scalarization attempt in fill_sched) */
+	if (!scalar_possible(instr) || instr->src_count > 1)
+		return ~0u;
+
+	if (is_export(instr))
+		return 4;
+
+	/* scalar-only instructions get the highest priority */
+	if (instr->alu.vector_opc == VECTOR_NONE)
+		return 0;
+	return 3;
+}
+
+/* this is a bit messy: walk back over the already scheduled slots to find
+ * one with a free scalar slot where a scalar MOV of src1 into register reg_idx
+ * can be paired with an already scheduled vector instruction; returns the
+ * earliest such slot (NULL if none), *comp = lowest component left in mask */
+static struct ir2_sched_instr*
+insert(struct ir2_context *ctx, unsigned block_idx, unsigned reg_idx,
+	struct ir2_src src1, unsigned *comp)
+{
+	struct ir2_sched_instr *sched = NULL, *s;
+	unsigned i, mask = 0xf; /* components of reg_idx clear in every visited slot */
+
+	/* go back to the earliest point where the mov can be inserted */
+	for (i = ctx->instr_sched_count-1; i > 0; i--) {
+		s = &ctx->instr_sched[i - 1];
+		/* stop at the first slot that belongs to another block */
+		if (s->instr && s->instr->block_idx != block_idx)
+			break;
+		if (s->instr_s && s->instr_s->block_idx != block_idx)
+			break;
+		/* the mov cannot go in or before the slot that produces src1 */
+		if (src1.type == IR2_SRC_SSA) {
+			if ((s->instr && s->instr->idx == src1.num) ||
+				(s->instr_s && s->instr_s->idx == src1.num))
+				break;
+		}
+		/* reg_state has 4 bits per register; mr = bits that are clear for reg_idx */
+		unsigned mr = ~(s->reg_state[reg_idx/8] >> reg_idx%8*4 & 0xf);
+		if ((mask & mr) == 0)
+			break;
+
+		mask &= mr;
+		if (s->instr_s || s->instr->src_count == 3)
+			continue; /* scalar slot already taken, or a 3-source vector op */
+
+		if (s->instr->type != IR2_ALU || s->instr->alu.export >= 0)
+			continue; /* only pair with a non-export ALU instruction */
+
+		sched = s;
+	}
+	*comp = ffs(mask) - 1;
+	return sched;
+}
+
+/* case1: insert a mov which places the other src into the same reg as
+ * src[order] (scalar sources come from the same register); returns true
+ * when the mov was inserted and instr was updated, false if not possible
+ *
+ * this is a common case which works when one of the srcs is input/const
+ * but for instrs which have 2 ssa/reg srcs, it is not ideal
+ */
+static bool
+scalarize_case1(struct ir2_context *ctx, struct ir2_instr *instr, bool order)
+{
+	struct ir2_src src0 = instr->src[ order];
+	struct ir2_src src1 = instr->src[!order];
+	struct ir2_sched_instr *sched;
+	struct ir2_instr *ins;
+	struct ir2_reg *reg;
+	unsigned idx, comp;
+	/* src0 provides the register the mov writes into, so const/input are rejected */
+	switch (src0.type) {
+	case IR2_SRC_CONST:
+	case IR2_SRC_INPUT:
+		return false;
+	default:
+		break;
+	}
+
+	/* TODO, insert needs logic for this */
+	if (src1.type == IR2_SRC_REG)
+		return false;
+
+	/* we could do something if they match src1.. */
+	if (src0.negate || src0.abs)
+		return false;
+
+	reg = get_reg_src(ctx, &src0);
+
+	/* the reg must not be used any further, since the mov will overwrite it */
+	for (int i = 0; i < 4; i++)
+		if (reg->comp[i].ref_count != !!(instr->alu.write_mask & 1 << i))
+			return false;
+
+	/* find a place to insert the mov */
+	sched = insert(ctx, instr->block_idx, reg->idx, src1, &comp);
+	if (!sched)
+		return false;
+	/* append a new instruction: a single-source scalar op writing component comp of reg */
+	ins = &ctx->instr[idx = ctx->instr_count++];
+	ins->idx = idx;
+	ins->type = IR2_ALU;
+	ins->src[0] = src1;
+	ins->src_count = 1;
+	ins->is_ssa = true;
+	ins->ssa.idx = reg->idx;
+	ins->ssa.ncomp = 1;
+	ins->ssa.comp[0].c = comp;
+	ins->alu.scalar_opc = MAXs; /* single-source MAXs serves as the mov */
+	ins->alu.export = -1;
+	ins->alu.write_mask = 1;
+	ins->pred = instr->pred;
+	ins->block_idx = instr->block_idx;
+	/* instr now takes both operands from src0's reg; src1_swizzle selects the moved component */
+	instr->src[0] = src0;
+	instr->alu.src1_swizzle = comp;
+
+	sched->instr_s = ins; /* the mov occupies the scalar slot found by insert() */
+	return true;
+}
+/* choose what goes into sched next; returns the block index of the chosen instruction(s), or -1 when no instruction is available */
+static int fill_sched(struct ir2_context *ctx, struct ir2_sched_instr *sched)
+{
+	struct ir2_instr *avail[0x100], *instr_v = NULL, *instr_s = NULL;
+	unsigned avail_count = 0;
+	instr_alloc_type_t export = ~0u;
+	int block_idx = -1;
+
+	/* find the lowest export buffer among the pending exports (XXX merge this loop with the other one somehow?) */
+	ir2_foreach_instr(instr, ctx) {
+		if (!instr->need_emit)
+			continue;
+		if (is_export(instr))
+			export = MIN2(export, export_buf(instr->alu.export));
+	}
+
+	/* collect the instructions of the current block whose sources are ready */
+	ir2_foreach_instr(instr, ctx) {
+		if (!instr->need_emit)
+			continue;
+
+		/* dont mix exports */
+		if (is_export(instr) && export_buf(instr->alu.export) != export)
+			continue;
+
+		if (block_idx < 0)
+			block_idx = instr->block_idx;
+		else if (block_idx != instr->block_idx || /* must be same block */
+			instr->type == IR2_CF || /* CF/MEM must be alone */
+			(is_export(instr) && export == SQ_MEMORY))
+			break;
+		/* it works because IR2_CF is always at end of block
+		 * and somewhat same idea with MEM exports, which might not be alone
+		 * but will end up in-order at least
+		 */
+
+		/* check if dependencies are satisfied */
+		bool is_ok = true;
+		ir2_foreach_src(src, instr) {
+			if (src->type == IR2_SRC_REG) {
+				/* need to check if all previous instructions in the block
+				 * which write the reg have been emitted
+				 * slow..
+				 * XXX: check components instead of whole register
+				 */
+				struct ir2_reg *reg = get_reg_src(ctx, src);
+				ir2_foreach_instr(p, ctx) {
+					if (!p->is_ssa && p->reg == reg && p->idx < instr->idx)
+						is_ok &= !p->need_emit;
+				}
+			} else if (src->type == IR2_SRC_SSA) {
+				/* in this case its easy, just check need_emit */
+				is_ok &= !ctx->instr[src->num].need_emit;
+			}
+		}
+		if (!is_ok)
+			continue;
+		if (avail_count == ARRAY_SIZE(avail))
+			break; /* candidate list is full, the rest is considered on a later call */
+		avail[avail_count++] = instr;
+	}
+
+	if (!avail_count) {
+		assert(block_idx == -1);
+		return -1;
+	}
+
+	/* priority to FETCH instructions */
+	ir2_foreach_avail(instr) {
+		if (instr->type == IR2_ALU)
+			continue;
+
+		ra_src_free(ctx, instr);
+		ra_reg(ctx, get_reg(instr), -1, false, 0);
+
+		instr->need_emit = false;
+		sched->instr = instr;
+		sched->instr_s = NULL;
+		return block_idx;
+	}
+
+	/* TODO precompute priorities */
+
+	/* best candidate for the vector slot (lowest alu_vector_prio value) */
+	unsigned prio_v = ~0u, prio_s = ~0u, prio;
+	ir2_foreach_avail(instr) {
+		prio = alu_vector_prio(instr);
+		if (prio < prio_v) {
+			instr_v = instr;
+			prio_v = prio;
+		}
+	}
+
+	/* TODO can still insert scalar if src_count=3, if smart about it */
+	if (!instr_v || instr_v->src_count < 3) {
+		ir2_foreach_avail(instr) {
+			bool compat = is_alu_compatible(instr_v, instr);
+
+			prio = alu_scalar_prio(instr);
+			if (prio >= prio_v && !compat)
+				continue;
+
+			if (prio < prio_s) {
+				instr_s = instr;
+				prio_s = prio;
+				if (!compat)
+					instr_v = NULL; /* the scalar choice displaces the vector one */
+			}
+		}
+	}
+
+	assert(instr_v || instr_s);
+
+	/* now, we try more complex insertion of vector instruction as scalar
+	 * TODO: if we are smart we can still insert if instr_v->src_count==3
+	 */
+	if (!instr_s && instr_v->src_count < 3) {
+		ir2_foreach_avail(instr) {
+			if (!is_alu_compatible(instr_v, instr) || !scalar_possible(instr))
+				continue;
+
+			/* at this point, src_count should always be 2 */
+			assert(instr->src_count == 2);
+
+			if (scalarize_case1(ctx, instr, 0)) {
+				instr_s = instr;
+				break;
+			}
+			if (scalarize_case1(ctx, instr, 1)) {
+				instr_s = instr;
+				break;
+			}
+		}
+	}
+
+	/* sources of both picks are released before their destinations are allocated */
+	if (instr_v) {
+		instr_v->need_emit = false;
+		ra_src_free(ctx, instr_v);
+	}
+
+	if (instr_s) {
+		instr_s->need_emit = false;
+		ra_src_free(ctx, instr_s);
+	}
+
+	if (instr_v)
+		ra_reg(ctx, get_reg(instr_v), -1, is_export(instr_v), instr_v->alu.write_mask);
+
+	if (instr_s)
+		ra_reg(ctx, get_reg(instr_s), -1, is_export(instr_s), instr_s->alu.write_mask);
+
+	sched->instr = instr_v;
+	sched->instr_s = instr_s;
+	return block_idx;
+}
+
+/* scheduling: determine order of instructions (fills ctx->instr_sched) */
+static void schedule_instrs(struct ir2_context *ctx)
+{
+	struct ir2_sched_instr *sched;
+	int block_idx;
+
+	/* allocate input registers */
+	for (unsigned idx = 0; idx < ARRAY_SIZE(ctx->input); idx++)
+		if (ctx->input[idx].initialized)
+			ra_reg(ctx, &ctx->input[idx], idx, false, 0);
+
+	for (;;) {
+		/* the slot is claimed before it is filled, so insert() skips the last entry */
+		sched = &ctx->instr_sched[ctx->instr_sched_count++];
+		block_idx = fill_sched(ctx, sched);
+		if (block_idx < 0)
+			break;
+		memcpy(sched->reg_state, ctx->reg_state, sizeof(ctx->reg_state));
+
+		/* catch texture fetch after scheduling and insert the
+		 * SET_TEX_LOD right before it if necessary
+		 * TODO clean this up
+		 */
+		struct ir2_instr *instr = sched->instr, *instr2;
+		if (instr && instr->type == IR2_FETCH &&
+			instr->fetch.opc == TEX_FETCH && instr->src_count == 2) {
+			/* generate the SET_LOD instruction */
+			instr2 = &ctx->instr[ctx->instr_count++];
+			instr2->type = IR2_FETCH;
+			instr2->block_idx = instr->block_idx;
+			instr2->pred = instr->pred;
+			instr2->fetch.opc = TEX_SET_TEX_LOD;
+			instr2->src[0] = instr->src[1];
+			instr2->src_count = 1;
+
+			/* shift the fetch one slot down and put the SET_LOD in its place */
+			sched[1] = sched[0];
+			sched->instr = instr2;
+			ctx->instr_sched_count++;
+		}
+
+		bool free_block = true;
+		ir2_foreach_instr(instr, ctx)
+			free_block &= instr->block_idx != block_idx;
+		if (free_block)
+			ra_block_free(ctx, block_idx);
+	}
+	ctx->instr_sched_count--; /* drop the slot claimed by the final, failed fill_sched() */
+}
+
+void ir2_compile(struct fd2_shader_stateobj *so, unsigned variant)
+{
+	/* per-compile state; every member not named here starts out zeroed */
+	struct ir2_context ctx = {
+		.so = so,
+		.info = &so->info[variant],
+	};
+	ctx.info->max_reg = -1;
+
+	/* step 1: translate the nir shader into the internal representation */
+	ir2_nir_compile(&ctx, variant);
+
+	/* step 2: drop the movs that only load inputs/constants/uniforms */
+	substitutions(&ctx);
+
+	/* step 3: compute ref_counts and kill instructions nobody needs */
+	ra_count_refs(&ctx);
+
+	/* step 4: drop the movs that only write outputs */
+	late_substitutions(&ctx);
+
+	/* step 5: decide instruction order and do vector->scalar conversions */
+	schedule_instrs(&ctx);
+
+	/* step 6: assemble the scheduled instructions to bitcode */
+	assemble(&ctx);
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2.h b/src/gallium/drivers/freedreno/a2xx/ir2.h
new file mode 100644
index 0000000000..a28de71902
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#ifndef IR2_H_
+#define IR2_H_
+
+#include "compiler/nir/nir.h"
+/* record kept by assemble() for each VTX_FETCH/TEX_FETCH so the instruction can be patched later */
+struct ir2_fetch_info {
+	/* dword offset of the fetch instruction in the assembled shader */
+	uint16_t offset;
+	union {
+		/* VTX_FETCH: dst swizzle emitted by the compiler, to merge with tgsi swizzle */
+		struct {
+			uint16_t dst_swiz;
+		} vtx;
+		/* TEX_FETCH: sampler id to patch const_idx, and the emitted src swizzle */
+		struct {
+			uint16_t samp_id;
+			uint8_t src_swiz;
+		} tex;
+	};
+};
+/* result of compiling one shader variant, filled in by ir2_compile() */
+struct ir2_shader_info {
+	/* compiled shader, malloc'ed by assemble(): CF instructions followed by the ALU/fetch instructions */
+	uint32_t *dwords;
+
+	/* size of the compiled shader in dwords */
+	uint16_t sizedwords;
+
+	/* highest GPR # used by shader (ir2_compile() starts it at -1) */
+	int8_t max_reg;
+
+	/* offset in dwords of first MEMORY export CF (for a20x hw binning), -1 if there is none */
+	int16_t export32_offset;
+
+	/* fetch instruction info for patching: fetch_info[0..num_fetch_instrs-1] are valid */
+	uint16_t num_fetch_instrs;
+	struct ir2_fetch_info fetch_info[64];
+};
+
+struct fd2_shader_stateobj;
+struct fd_program_stateobj;
+struct tgsi_token;
+
+void ir2_compile(struct fd2_shader_stateobj *so, unsigned variant);
+
+struct nir_shader *ir2_tgsi_to_nir(const struct tgsi_token *tokens);
+
+const nir_shader_compiler_options *ir2_get_compiler_options(void);
+
+int ir2_optimize_nir(nir_shader *s, bool lower);
+
+#endif							/* IR2_H_ */
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c
new file mode 100644
index 0000000000..c6e9590b3c
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_assemble.c
@@ -0,0 +1,546 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include "ir2_private.h"
+
+static unsigned
+src_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
+{
+	/* only ssa/reg sources are register allocated, the others keep
+	 * the swizzle they already carry */
+	if (src->type != IR2_SRC_SSA && src->type != IR2_SRC_REG)
+		return src->swizzle;
+
+	/* remap each selected component to the channel that register
+	 * allocation assigned to it
+	 */
+	struct ir2_reg_component *chan = get_reg_src(ctx, src)->comp;
+	unsigned result = 0;
+
+	for (unsigned i = 0; i < ncomp; i++) {
+		unsigned sel = swiz_get(src->swizzle, i);
+		result |= swiz_set(chan[sel].c, i);
+	}
+	return result;
+}
+
+/* alu instrs need to take into account how the output components are allocated */
+
+/* source swizzle for a single-source scalar op; no dest swizzle involved */
+static unsigned
+alu_swizzle_scalar(struct ir2_context *ctx, struct ir2_src *reg)
+{
+	/* the hw seems to read W only, but replicate to all channels anyway */
+	unsigned first = src_swizzle(ctx, reg, 1);
+	return swiz_merge(first, IR2_SWIZZLE_XXXX);
+}
+/* swizzle for a source of a vector-slot instr: the allocation-aware source swizzle merged with the placement of the dest components */
+static unsigned
+alu_swizzle(struct ir2_context *ctx, struct ir2_instr *instr, struct ir2_src *src)
+{
+	struct ir2_reg_component *comp = get_reg(instr)->comp;
+	unsigned swiz0 = src_swizzle(ctx, src, src_ncomp(instr));
+	unsigned swiz = 0;
+
+	/* non per component special cases */
+	switch (instr->alu.vector_opc) {
+	case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv: /* GNU case range extension */
+		return alu_swizzle_scalar(ctx, src);
+	case DOT2ADDv:
+	case DOT3v:
+	case DOT4v:
+	case CUBEv:
+		return swiz0; /* dest placement is not applied for these opcodes */
+	default:
+		break;
+	}
+	/* j walks the write_mask bits, i counts the written components seen so far */
+	for (int i = 0, j = 0; i < dst_ncomp(instr); j++) {
+		if (instr->alu.write_mask & 1 << j) {
+			if (comp[j].c != 7) /* NOTE(review): 7 looks like "no channel allocated" - confirm */
+				swiz |= swiz_set(i, comp[j].c);
+			i++;
+		}
+	}
+	return swiz_merge(swiz0, swiz);
+}
+
+static unsigned
+alu_swizzle_scalar2(struct ir2_context *ctx, struct ir2_src *src, unsigned s1)
+{
+	/* the hw seems to read the pair from ZW; replicate it everywhere (ABAB) */
+	unsigned remapped = src_swizzle(ctx, src, 1), s0 = swiz_get(remapped, 0);
+	return swiz_merge(swiz_set(s0, 0) | swiz_set(s1, 1), IR2_SWIZZLE_XYXY);
+}
+
+/* translate the instruction's write mask into the channels that
+ * register allocation picked for each component */
+static unsigned
+alu_write_mask(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	struct ir2_reg_component *chan = get_reg(instr)->comp;
+	unsigned pending = instr->alu.write_mask & 0xf;
+	unsigned hw_mask = 0;
+
+	for (unsigned i = 0; pending; i++, pending >>= 1) {
+		if (pending & 1)
+			hw_mask |= 1 << chan[i].c;
+	}
+	return hw_mask;
+}
+
+/* fetch instructions can swizzle dest, but the src swizzle has to be
+ * converted: it is repacked here as 2 bits per component */
+static unsigned
+fetch_swizzle(struct ir2_context *ctx, struct ir2_src *src, unsigned ncomp)
+{
+	const unsigned alu_enc = src_swizzle(ctx, src, ncomp);
+	unsigned packed = 0;
+	for (unsigned ch = 0, shift = 0; ch < ncomp; ch++, shift += 2)
+		packed |= swiz_get(alu_enc, ch) << shift;
+	return packed;
+}
+/* dest swizzle of a fetch: result component i goes to the channel allocated for it */
+static unsigned
+fetch_dst_swiz(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	struct ir2_reg_component *chan = get_reg(instr)->comp;
+	unsigned packed = 0xfff;	/* 3 bits per channel, all ones initially */
+	for (int i = 0; i < dst_ncomp(instr); i++) {
+		unsigned shift = chan[i].c * 3;	/* field of the allocated channel */
+		packed = (packed & ~(7 << shift)) | i << shift;
+	}
+	return packed;
+}
+
+/* destination # for instr: the export number when it is an export,
+ * otherwise the index of its register
+ */
+static unsigned
+dst_to_reg(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	return is_export(instr) ? instr->alu.export
+				: get_reg(instr)->idx;
+}
+
+/* register # for src: index of the register returned by get_reg_src() */
+static unsigned src_to_reg(struct ir2_context *ctx, struct ir2_src *src)
+{
+	return get_reg_src(ctx, src)->idx;
+}
+
+static unsigned src_reg_byte(struct ir2_context *ctx, struct ir2_src *src)
+{
+	if (src->type != IR2_SRC_CONST)	/* register: 0x80 is set for abs */
+		return src_to_reg(ctx, src) | (src->abs ? 0x80 : 0);
+
+	assert(!src->abs); /* no abs bit for const */
+	return src->num;
+}
+
+/* produce the 12 byte binary instruction for a given sched_instr; *is_fetch tells the caller which kind was written */
+static void
+fill_instr(struct ir2_context *ctx, struct ir2_sched_instr *sched,
+		   instr_t *bc, bool * is_fetch)
+{
+	struct ir2_instr *instr = sched->instr, *instr_s, *instr_v;
+
+	*bc = (instr_t) {};
+
+	if (instr && instr->type == IR2_FETCH) {
+		*is_fetch = true;
+
+		bc->fetch.opc = instr->fetch.opc;
+		bc->fetch.pred_select = !!instr->pred;
+		bc->fetch.pred_condition = instr->pred & 1;
+
+		struct ir2_src *src = instr->src;
+
+		if (instr->fetch.opc == VTX_FETCH) {
+			instr_fetch_vtx_t *vtx = &bc->fetch.vtx;
+
+			assert(instr->fetch.vtx.const_idx <= 0x1f);
+			assert(instr->fetch.vtx.const_idx_sel <= 0x3);
+
+			vtx->src_reg = src_to_reg(ctx, src);
+			vtx->src_swiz = fetch_swizzle(ctx, src, 1);
+			vtx->dst_reg = dst_to_reg(ctx, instr);
+			vtx->dst_swiz = fetch_dst_swiz(ctx, instr);
+
+			vtx->must_be_one = 1;
+			vtx->const_index = instr->fetch.vtx.const_idx;
+			vtx->const_index_sel = instr->fetch.vtx.const_idx_sel;
+
+			/* other fields will be patched */
+
+			/* XXX seems like every FETCH but the first has
+			 * this bit set:
+			 */
+			vtx->reserved3 = instr->idx ? 0x1 : 0x0;
+			vtx->reserved0 = instr->idx ? 0x2 : 0x3;
+		} else if (instr->fetch.opc == TEX_FETCH) {
+			instr_fetch_tex_t *tex = &bc->fetch.tex;
+
+			tex->src_reg = src_to_reg(ctx, src);
+			tex->src_swiz = fetch_swizzle(ctx, src, 3);
+			tex->dst_reg = dst_to_reg(ctx, instr);
+			tex->dst_swiz = fetch_dst_swiz(ctx, instr);
+			/* tex->const_idx = patch_fetches */
+			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
+			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
+			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->use_comp_lod = ctx->so->type == SHADER_FRAGMENT;
+			tex->use_reg_lod = instr->src_count == 2; /* 2nd src is the lod, see schedule_instrs */
+			tex->sample_location = SAMPLE_CENTER;
+			tex->tx_coord_denorm = instr->fetch.tex.is_rect;
+		} else if (instr->fetch.opc == TEX_SET_TEX_LOD) {
+			instr_fetch_tex_t *tex = &bc->fetch.tex;
+
+			tex->src_reg = src_to_reg(ctx, src);
+			tex->src_swiz = fetch_swizzle(ctx, src, 1);
+			tex->dst_reg = 0;
+			tex->dst_swiz = 0xfff;
+
+			tex->mag_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->min_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->mip_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->aniso_filter = ANISO_FILTER_USE_FETCH_CONST;
+			tex->arbitrary_filter = ARBITRARY_FILTER_USE_FETCH_CONST;
+			tex->vol_mag_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->vol_min_filter = TEX_FILTER_USE_FETCH_CONST;
+			tex->use_comp_lod = 1;
+			tex->use_reg_lod = 0;
+			tex->sample_location = SAMPLE_CENTER;
+		} else {
+			assert(0);
+		}
+		return;
+	}
+
+	/* ALU: the vector and scalar ops are co-issued in one instruction word */
+	instr_v = sched->instr;
+	instr_s = sched->instr_s;
+
+	if (instr_v) {
+		struct ir2_src src1, src2, *src3;
+
+		src1 = instr_v->src[0];
+		src2 = instr_v->src[instr_v->src_count > 1]; /* single-source ops reuse src[0] */
+		src3 = instr_v->src_count == 3 ? &instr_v->src[2] : NULL;
+
+		bc->alu.vector_opc = instr_v->alu.vector_opc;
+		bc->alu.vector_write_mask = alu_write_mask(ctx, instr_v);
+		bc->alu.vector_dest = dst_to_reg(ctx, instr_v);
+		bc->alu.vector_clamp = instr_v->alu.saturate;
+		bc->alu.export_data = instr_v->alu.export >= 0;
+
+		/* single operand SETEv, use 0.0f as src2 */
+		if (instr_v->src_count == 1 &&
+			(bc->alu.vector_opc == SETEv ||
+			bc->alu.vector_opc == SETNEv ||
+			bc->alu.vector_opc == SETGTv ||
+			bc->alu.vector_opc == SETGTEv))
+			src2 = ir2_zero(ctx);
+
+		/* export32 instr for a20x hw binning has this bit set..
+		 * it seems to do more than change the base address of constants
+		 * XXX this is a hack
+		 */
+		bc->alu.relative_addr =
+			(bc->alu.export_data && bc->alu.vector_dest == 32);
+
+		bc->alu.src1_reg_byte = src_reg_byte(ctx, &src1);
+		bc->alu.src1_swiz = alu_swizzle(ctx, instr_v, &src1);
+		bc->alu.src1_reg_negate = src1.negate;
+		bc->alu.src1_sel = src1.type != IR2_SRC_CONST;
+
+		bc->alu.src2_reg_byte = src_reg_byte(ctx, &src2);
+		bc->alu.src2_swiz = alu_swizzle(ctx, instr_v, &src2);
+		bc->alu.src2_reg_negate = src2.negate;
+		bc->alu.src2_sel = src2.type != IR2_SRC_CONST;
+
+		if (src3) {
+			bc->alu.src3_reg_byte = src_reg_byte(ctx, src3);
+			bc->alu.src3_swiz = alu_swizzle(ctx, instr_v, src3);
+			bc->alu.src3_reg_negate = src3->negate;
+			bc->alu.src3_sel = src3->type != IR2_SRC_CONST;
+		}
+
+		bc->alu.pred_select = instr_v->pred;
+	}
+
+	if (instr_s) {
+		struct ir2_src *src = instr_s->src;
+
+		bc->alu.scalar_opc = instr_s->alu.scalar_opc;
+		bc->alu.scalar_write_mask = alu_write_mask(ctx, instr_s);
+		bc->alu.scalar_dest = dst_to_reg(ctx, instr_s);
+		bc->alu.scalar_clamp = instr_s->alu.saturate;
+		bc->alu.export_data = instr_s->alu.export >= 0;
+
+		/* the scalar op takes its operand(s) from the src3 fields */
+		if (instr_s->src_count == 1) {
+			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
+			bc->alu.src3_swiz = alu_swizzle_scalar(ctx, src);
+			bc->alu.src3_reg_negate = src->negate;
+			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
+		} else {
+			assert(instr_s->src_count == 2);
+
+			bc->alu.src3_reg_byte = src_reg_byte(ctx, src);
+			bc->alu.src3_swiz = alu_swizzle_scalar2(ctx, src, instr_s->alu.src1_swizzle);
+			bc->alu.src3_reg_negate = src->negate;
+			bc->alu.src3_sel = src->type != IR2_SRC_CONST;
+		}
+
+		if (instr_v)
+			assert(instr_s->pred == instr_v->pred);
+		bc->alu.pred_select = instr_s->pred;
+	}
+
+	*is_fetch = false;
+	return;
+}
+/* write the pending exec CF (preceded by *alloc when given) at cf_idx and reset exec for the next group; returns the new cf_idx */
+static unsigned
+write_cfs(struct ir2_context *ctx, instr_cf_t * cfs, unsigned cf_idx,
+		  instr_cf_alloc_t *alloc, instr_cf_exec_t *exec)
+{
+	assert(exec->count);
+
+	if (alloc)
+		cfs[cf_idx++].alloc = *alloc;
+
+	/* remember the dword offset at the first memory alloc, for patching (3 dwords per CF pair) */
+	if (alloc && alloc->buffer_select == SQ_MEMORY &&
+		ctx->info->export32_offset == -1)
+		ctx->info->export32_offset = cf_idx / 2 * 3;
+
+	cfs[cf_idx++].exec = *exec;
+	exec->address += exec->count; /* the next exec starts right after these instructions */
+	exec->serialize = 0;
+	exec->count = 0;
+
+	return cf_idx;
+}
+
+/* assemble the final shader: CF section followed by the ALU/fetch instructions, stored in ctx->info */
+void assemble(struct ir2_context *ctx)
+{
+	/* hw seems to have a limit of 384 (num_cf/2+num_instr <= 384)
+	 * address is 9 bits so could it be 512 ?
+	 */
+	instr_cf_t cfs[384];
+	instr_t bytecode[384], bc;
+	unsigned block_addr[128];
+	unsigned num_cf = 0;
+
+	/* CF instr state */
+	instr_cf_exec_t exec = {.opc = EXEC};
+	instr_cf_alloc_t alloc = {.opc = ALLOC};
+
+	int sync_id, sync_id_prev = -1;
+	bool is_fetch = false;
+	bool need_sync = true;
+	bool need_alloc = false;
+	unsigned block_idx = 0;
+
+	ctx->info->export32_offset = -1;
+	ctx->info->num_fetch_instrs = 0;
+
+	if ((ctx->so->type == SHADER_VERTEX && ctx->so->f.inputs_count == 0) ||
+		ctx->info == &ctx->so->info[1]) { /* hack to check binning variant */
+		alloc.buffer_select = SQ_PARAMETER_PIXEL;
+		cfs[num_cf++].alloc = alloc;
+	}
+
+	block_addr[0] = 0;
+
+	/* i counts emitted alu/fetch instructions, j walks the scheduled slots */
+	for (int i = 0, j = 0; j < ctx->instr_sched_count; j++) {
+		struct ir2_instr *instr = ctx->instr_sched[j].instr;
+
+		/* catch IR2_CF since it isn't a regular instruction */
+		if (instr && instr->type == IR2_CF) {
+			assert(!need_alloc); /* XXX */
+
+			/* flush any exec cf before inserting jmp */
+			if (exec.count)
+				num_cf = write_cfs(ctx, cfs, num_cf, NULL, &exec);
+
+			cfs[num_cf++].jmp_call = (instr_cf_jmp_call_t) {
+				.opc = COND_JMP,
+				.address = instr->cf.block_idx, /* will be fixed later */
+				.force_call = !instr->pred,
+				.predicated_jmp = 1,
+				.direction = instr->cf.block_idx > instr->block_idx,
+				.condition = instr->pred & 1,
+			};
+			continue;
+		}
+
+		/* fill the 3 dwords for the instruction */
+		fill_instr(ctx, &ctx->instr_sched[j], &bc, &is_fetch);
+
+		/* we need to sync between ALU/VTX_FETCH/TEX_FETCH types */
+		sync_id = 0;
+		if (is_fetch)
+			sync_id = bc.fetch.opc == VTX_FETCH ? 1 : 2;
+
+		need_sync = sync_id != sync_id_prev;
+		sync_id_prev = sync_id;
+
+		/* block of this slot; slots must come in non-decreasing block order */
+		unsigned block;
+		{
+			if (ctx->instr_sched[j].instr)
+				block = ctx->instr_sched[j].instr->block_idx;
+			else
+				block = ctx->instr_sched[j].instr_s->block_idx;
+			assert(block_idx <= block && block < ARRAY_SIZE(block_addr));
+		}
+
+		/* info for patching, only VTX_FETCH/TEX_FETCH are patched later */
+		if (is_fetch &&
+			(bc.fetch.opc == VTX_FETCH || bc.fetch.opc == TEX_FETCH)) {
+			assert(ctx->info->num_fetch_instrs < ARRAY_SIZE(ctx->info->fetch_info));
+			struct ir2_fetch_info *info =
+				&ctx->info->fetch_info[ctx->info->num_fetch_instrs++];
+			info->offset = i * 3;	/* add cf offset later */
+
+			if (bc.fetch.opc == VTX_FETCH) {
+				info->vtx.dst_swiz = bc.fetch.vtx.dst_swiz;
+			} else {
+				info->tex.samp_id = instr->fetch.tex.samp_id;
+				info->tex.src_swiz = bc.fetch.tex.src_swiz;
+			}
+		}
+
+		/* exec cf after 6 instr or when switching between fetch / alu */
+		if (exec.count == 6 || (exec.count && (need_sync || block != block_idx))) {
+			num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+			need_alloc = false;
+		}
+
+		/* update block_addrs for jmp patching */
+		while (block_idx < block)
+			block_addr[++block_idx] = num_cf;
+
+		/* export - fill alloc cf */
+		if (!is_fetch && bc.alu.export_data) {
+			/* get the export buffer from either vector/scalar dest */
+			instr_alloc_type_t buffer =
+				export_buf(bc.alu.vector_dest);
+			if (bc.alu.scalar_write_mask) {
+				if (bc.alu.vector_write_mask)
+					assert(buffer == export_buf(bc.alu.scalar_dest));
+				buffer = export_buf(bc.alu.scalar_dest);
+			}
+
+			/* flush previous alloc if the buffer changes */
+			bool need_new_alloc = buffer != alloc.buffer_select;
+
+			/* memory export always in 32/33 pair, new alloc on 32 */
+			if (bc.alu.vector_dest == 32)
+				need_new_alloc = true;
+
+			if (need_new_alloc && exec.count) {
+				num_cf = write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+				need_alloc = false;
+			}
+
+			need_alloc |= need_new_alloc;
+
+			alloc.size = 0;
+			alloc.buffer_select = buffer;
+
+			if (buffer == SQ_PARAMETER_PIXEL && ctx->so->type == SHADER_VERTEX)
+				alloc.size = ctx->so->f.inputs_count - 1;
+
+			if (buffer == SQ_POSITION)
+				alloc.size = ctx->so->writes_psize;
+		}
+
+		/* two serialize bits per instruction of the exec: 0x1 = fetch, 0x2 = sync */
+		if (is_fetch)
+			exec.serialize |= 0x1 << exec.count * 2;
+		if (need_sync)
+			exec.serialize |= 0x2 << exec.count * 2;
+
+		need_sync = false;
+		exec.count += 1;
+		assert(i < ARRAY_SIZE(bytecode));
+		bytecode[i++] = bc;
+	}
+
+	/* final exec cf */
+	exec.opc = EXEC_END;
+	num_cf =
+		write_cfs(ctx, cfs, num_cf, need_alloc ? &alloc : NULL, &exec);
+
+	/* insert nop to get an even # of CFs */
+	if (num_cf % 2)
+		cfs[num_cf++] = (instr_cf_t) {
+		.opc = NOP};
+
+	/* patch cf addrs */
+	for (int idx = 0; idx < num_cf; idx++) {
+		switch (cfs[idx].opc) {
+		case NOP:
+		case ALLOC:
+			break;
+		case EXEC:
+		case EXEC_END:
+			cfs[idx].exec.address += num_cf / 2; /* instructions start after the CF pairs */
+			break;
+		case COND_JMP:
+			cfs[idx].jmp_call.address = block_addr[cfs[idx].jmp_call.address];
+			break;
+		default:
+			assert(0);
+		}
+	}
+
+	/* concatenate cfs and alu/fetch */
+	uint32_t cfdwords = num_cf / 2 * 3;
+	uint32_t alufetchdwords = exec.address * 3;
+	uint32_t sizedwords = cfdwords + alufetchdwords;
+	uint32_t *dwords = malloc(sizedwords * 4);
+	assert(dwords);
+	memcpy(dwords, cfs, cfdwords * 4);
+	memcpy(&dwords[cfdwords], bytecode, alufetchdwords * 4);
+
+	/* finalize ir2_shader_info */
+	ctx->info->dwords = dwords;
+	ctx->info->sizedwords = sizedwords;
+	for (int i = 0; i < ctx->info->num_fetch_instrs; i++)
+		ctx->info->fetch_info[i].offset += cfdwords;
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		DBG("disassemble: type=%d", ctx->so->type);
+		disasm_a2xx(dwords, sizedwords, 0, ctx->so->type);
+	}
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
new file mode 100644
index 0000000000..93c4246cc6
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir.c
@@ -0,0 +1,1184 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include "ir2_private.h"
+#include "nir/tgsi_to_nir.h"
+
+#include "freedreno_util.h"
+#include "fd2_program.h"
+
+/* NIR compiler options for the a2xx backend. This one table is handed to
+ * tgsi_to_nir() and returned by ir2_get_compiler_options() below.
+ */
+static const nir_shader_compiler_options options = {
+	.lower_fpow = true,
+	.lower_flrp32 = true,
+	.lower_fmod32 = true,
+	.lower_fdiv = true,
+	.lower_fceil = true,
+	.fuse_ffma = true,
+	/* .fdot_replicates = true, it is replicated, but it makes things worse */
+	.lower_all_io_to_temps = true,
+	.vertex_id_zero_based = true, /* it's not implemented anyway */
+};
+
+/* Translate a TGSI token stream into a NIR shader using the a2xx compiler
+ * options defined in this file.
+ */
+struct nir_shader *
+ir2_tgsi_to_nir(const struct tgsi_token *tokens)
+{
+	return tgsi_to_nir(tokens, &options);
+}
+
+/* Return a pointer to the file-static a2xx NIR compiler options. */
+const nir_shader_compiler_options *
+ir2_get_compiler_options(void)
+{
+	return &options;
+}
+
+/* OPT runs a NIR pass through NIR_PASS and evaluates to the progress flag
+ * of that single run (uses a statement expression).
+ */
+#define OPT(nir, pass, ...) ({                             \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   this_progress;                                          \
+})
+/* OPT_V runs a NIR pass through NIR_PASS_V; no progress value is produced. */
+#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+/* Run the NIR optimization passes over and over until one complete round
+ * reports no progress from any of the tracked passes.
+ */
+static void
+ir2_optimize_loop(nir_shader *s)
+{
+	for (;;) {
+		bool changed = false;
+
+		OPT_V(s, nir_lower_vars_to_ssa);
+		changed |= OPT(s, nir_opt_copy_prop_vars);
+		changed |= OPT(s, nir_copy_prop);
+		changed |= OPT(s, nir_opt_dce);
+		changed |= OPT(s, nir_opt_cse);
+		/* changed |= OPT(s, nir_opt_gcm, true); */
+		changed |= OPT(s, nir_opt_peephole_select, UINT_MAX);
+		changed |= OPT(s, nir_opt_intrinsics);
+		changed |= OPT(s, nir_opt_algebraic);
+		changed |= OPT(s, nir_opt_constant_folding);
+		changed |= OPT(s, nir_opt_dead_cf);
+
+		/* When nir_opt_trivial_continues makes progress, clean up right
+		 * away so that nir_opt_if and nir_opt_loop_unroll have a chance
+		 * of making progress too.
+		 */
+		if (OPT(s, nir_opt_trivial_continues)) {
+			changed = true;
+			OPT(s, nir_copy_prop);
+			OPT(s, nir_opt_dce);
+		}
+
+		changed |= OPT(s, nir_opt_loop_unroll, nir_var_all);
+		changed |= OPT(s, nir_opt_if);
+		changed |= OPT(s, nir_opt_remove_phis);
+		changed |= OPT(s, nir_opt_undef);
+
+		if (!changed)
+			break;
+	}
+}
+
+/* The trig workarounds pass is the same one ir3 uses, but we don't want to
+ * include the ir3 headers, so the prototype is repeated here.
+ */
+bool ir3_nir_apply_trig_workarounds(nir_shader * shader);
+
+/* Lower and optimize a NIR shader for the a2xx backend.
+ *
+ * When `lower` is set, the trig workarounds and texture lowering passes are
+ * run as well. Returns -1 when a fragment shader has an output variable at
+ * FRAG_RESULT_DEPTH, and 0 otherwise.
+ */
+int
+ir2_optimize_nir(nir_shader *s, bool lower)
+{
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(s, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	OPT_V(s, nir_opt_global_to_local);
+	OPT_V(s, nir_lower_regs_to_ssa);
+	OPT_V(s, nir_lower_vars_to_ssa);
+	OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out);
+
+	if (lower) {
+		struct nir_lower_tex_options tex_options = {
+			.lower_txp = ~0u,
+			.lower_rect = 0,
+		};
+
+		OPT_V(s, ir3_nir_apply_trig_workarounds);
+		OPT_V(s, nir_lower_tex, &tex_options);
+	}
+
+	ir2_optimize_loop(s);
+
+	OPT_V(s, nir_remove_dead_variables, nir_var_local);
+	OPT_V(s, nir_move_load_const);
+
+	/* only fragment shaders are checked for depth writes */
+	if (s->info.stage != MESA_SHADER_FRAGMENT)
+		return 0;
+
+	/* TODO we don't want to get shaders writing to depth for depth textures */
+	nir_foreach_variable(var, &s->outputs) {
+		if (var->data.location == FRAG_RESULT_DEPTH)
+			return -1;
+	}
+
+	return 0;
+}
+
+/* Return a constant source that reads the `ncomp` values in `value_f`.
+ *
+ * Values are compared bit-for-bit through a uint32_t view of the floats.
+ * The function first tries to reuse an existing immediate slot, appending
+ * missing values to the free components of that slot; if no slot can hold
+ * all values, the next slot (index so->num_immediates) is filled instead.
+ * The returned swizzle maps each requested component to the slot component
+ * that holds its value.
+ *
+ * NOTE(review): the float* -> uint32_t* cast is type punning through a
+ * pointer cast; there is no bound check on so->num_immediates here, so the
+ * caller/array size must guarantee room -- confirm.
+ */
+static struct ir2_src
+load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp)
+{
+	struct fd2_shader_stateobj *so = ctx->so;
+	unsigned imm_ncomp, swiz, idx, i, j;
+	uint32_t *value = (uint32_t*) value_f;
+
+	/* try to merge with existing immediate (TODO: try with neg) */
+	for (idx = 0; idx < so->num_immediates; idx++) {
+		swiz = 0;
+		/* work on a local component count; it is only stored back to the
+		 * slot after the search (see the end of the function)
+		 */
+		imm_ncomp = so->immediates[idx].ncomp;
+		for (i = 0; i < ncomp; i++) {
+			/* look for value[i] among the components used so far */
+			for (j = 0; j < imm_ncomp; j++) {
+				if (value[i] == so->immediates[idx].val[j])
+					break;
+			}
+			if (j == imm_ncomp) {
+				/* not found and all 4 components are taken: this slot
+				 * cannot be used, try the next one
+				 */
+				if (j == 4)
+					break;
+				so->immediates[idx].val[imm_ncomp++] = value[i];
+			}
+			swiz |= swiz_set(j, i);
+		}
+		/* matched all components */
+		if (i == ncomp)
+			break;
+	}
+
+	/* need to allocate new immediate */
+	if (idx == so->num_immediates) {
+		swiz = 0;
+		imm_ncomp = 0;
+		for (i = 0; i < ncomp; i++) {
+			/* duplicates within the request share one component */
+			for (j = 0; j < imm_ncomp; j++) {
+				if (value[i] == ctx->so->immediates[idx].val[j])
+					break;
+			}
+			if (j == imm_ncomp) {
+				so->immediates[idx].val[imm_ncomp++] = value[i];
+			}
+			swiz |= swiz_set(j, i);
+		}
+		so->num_immediates++;
+	}
+	/* commit the component count of the slot that was chosen */
+	so->immediates[idx].ncomp = imm_ncomp;
+
+	return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST);
+}
+
+/* Return a constant source for a four-component 0.0 value, obtained through
+ * load_const().
+ */
+struct ir2_src
+ir2_zero(struct ir2_context *ctx)
+{
+	return load_const(ctx, (float[]) {0.0f, 0.0f, 0.0f, 0.0f}, 4);
+}
+
+/* Record a use or definition of `reg` at the current position: updates the
+ * loop depth tracked for the register and the block index after which it
+ * may be freed (block_idx_free, -1 when no block is recorded).
+ */
+static void
+update_range(struct ir2_context *ctx, struct ir2_reg *reg)
+{
+	/* first touch: remember the loop depth we were first seen at */
+	if (!reg->initialized) {
+		reg->loop_depth = ctx->loop_depth;
+		reg->initialized = true;
+	}
+
+	if (ctx->loop_depth <= reg->loop_depth) {
+		reg->loop_depth = ctx->loop_depth;
+		reg->block_idx_free = -1;
+	} else {
+		/* touched from a deeper loop than the one recorded for it */
+		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1];
+	}
+
+	/* for regs we want to free at the end of the loop in any case
+	 * XXX don't do this for ssa
+	 */
+	if (reg->loop_depth != 0)
+		reg->block_idx_free = ctx->loop_last_block[reg->loop_depth];
+}
+
+/* Build an ir2 source from a NIR source.
+ *
+ * Constant sources become immediates via load_const(). Otherwise the source
+ * refers to either a NIR register or the instruction that ssa_map records
+ * for the SSA value, and the liveness info of that register is updated.
+ */
+static struct ir2_src
+make_src(struct ir2_context *ctx, nir_src src)
+{
+	nir_const_value *cv = nir_src_as_const_value(src);
+	struct ir2_src out = {};
+	struct ir2_reg *range;
+
+	if (cv) {
+		assert(src.is_ssa);
+		return load_const(ctx, &cv->f32[0], src.ssa->num_components);
+	}
+
+	if (src.is_ssa) {
+		assert(ctx->ssa_map[src.ssa->index] >= 0);
+		out.num = ctx->ssa_map[src.ssa->index];
+		out.type = IR2_SRC_SSA;
+		range = &ctx->instr[out.num].ssa;
+	} else {
+		out.num = src.reg.reg->index;
+		out.type = IR2_SRC_REG;
+		range = &ctx->reg[out.num];
+	}
+
+	update_range(ctx, range);
+	return out;
+}
+
+/* Bind the destination of `instr` to a NIR destination.
+ *
+ * For an SSA destination the instruction index is recorded in ssa_map.
+ * For a register destination the instruction is switched to non-SSA and
+ * pointed at the matching ctx->reg entry. Either way the liveness info of
+ * the written register is updated.
+ */
+static void
+set_index(struct ir2_context *ctx, nir_dest * dst,
+		  struct ir2_instr *instr)
+{
+	if (dst->is_ssa) {
+		ctx->ssa_map[dst->ssa.index] = instr->idx;
+		update_range(ctx, &instr->ssa);
+		return;
+	}
+
+	assert(instr->is_ssa);
+
+	struct ir2_reg *target = &ctx->reg[dst->reg.reg->index];
+
+	instr->is_ssa = false;
+	instr->reg = target;
+	update_range(ctx, target);
+}
+
+/* Take the next free entry of ctx->instr and initialize it with the given
+ * type, the current block index and the current predicate. New instructions
+ * start out as SSA.
+ */
+static struct ir2_instr *
+ir2_instr_create(struct ir2_context *ctx, int type)
+{
+	struct ir2_instr *fresh = &ctx->instr[ctx->instr_count];
+
+	/* idx is the slot just taken; the count then moves past it */
+	fresh->idx = ctx->instr_count++;
+	fresh->type = type;
+	fresh->block_idx = ctx->block_idx;
+	fresh->pred = ctx->pred;
+	fresh->is_ssa = true;
+	return fresh;
+}
+
+/* Create an ALU instruction for a NIR opcode (or for the pseudo opcode
+ * ir2_op_cube) writing `ncomp` components.
+ *
+ * The scalar and vector a2xx opcodes are looked up in the table below; the
+ * sources are left for the caller to fill in. The write mask defaults to
+ * the low `ncomp` bits and the instruction is not an export (export = -1).
+ */
+static struct ir2_instr *
+instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp)
+{
+	/* emit_alu will fixup instrs that don't map directly */
+	/* One entry per NIR opcode plus one extra entry for ir2_op_cube;
+	 * -1 means there is no opcode for that slot.
+	 */
+	static const struct ir2_opc {
+		int8_t scalar, vector;
+	} nir_ir2_opc[nir_num_opcodes+1] = {
+		[0 ... nir_num_opcodes - 1] = {-1, -1},
+
+		[nir_op_fmov] = {MAXs, MAXv},
+		[nir_op_fsign] = {-1, CNDGTEv},
+		[nir_op_fnot] = {SETEs, SETEv},
+		[nir_op_f2b] = {SETNEs, SETNEv},
+		[nir_op_for] = {MAXs, MAXv},
+		[nir_op_fand] = {MINs, MINv},
+		[nir_op_fxor] = {-1, SETNEv},
+		[nir_op_fadd] = {ADDs, ADDv},
+		[nir_op_fsub] = {ADDs, ADDv},
+		[nir_op_fmul] = {MULs, MULv},
+		[nir_op_ffma] = {-1, MULADDv},
+		[nir_op_fmax] = {MAXs, MAXv},
+		[nir_op_fmin] = {MINs, MINv},
+		[nir_op_ffloor] = {FLOORs, FLOORv},
+		[nir_op_ffract] = {FRACs, FRACv},
+		[nir_op_ftrunc] = {TRUNCs, TRUNCv},
+		[nir_op_fdot2] = {-1, DOT2ADDv},
+		[nir_op_fdot3] = {-1, DOT3v},
+		[nir_op_fdot4] = {-1, DOT4v},
+		[nir_op_sge] = {-1, SETGTEv},
+		[nir_op_slt] = {-1, SETGTv},
+		[nir_op_sne] = {-1, SETNEv},
+		[nir_op_seq] = {-1, SETEv},
+		[nir_op_fcsel] = {-1, CNDEv},
+		[nir_op_frsq] = {RECIPSQ_IEEE, -1},
+		[nir_op_frcp] = {RECIP_IEEE, -1},
+		[nir_op_flog2] = {LOG_IEEE, -1},
+		[nir_op_fexp2] = {EXP_IEEE, -1},
+		[nir_op_fsqrt] = {SQRT_IEEE, -1},
+		[nir_op_fcos] = {COS, -1},
+		[nir_op_fsin] = {SIN, -1},
+		/* no fsat, fneg, fabs since source mods deal with those */
+		/* XXX non float ops.. */
+		[nir_op_imov] = {MAXs, MAXv},
+		[nir_op_bcsel] = {-1, CNDEv},
+		[nir_op_inot] = {SETEs, SETEv},
+		[nir_op_ior] = {MAXs, MAXv},
+		[nir_op_iand] = {MINs, MINv},
+
+		/* so we can use this function with non-nir op */
+#define ir2_op_cube nir_num_opcodes
+		[ir2_op_cube] = {-1, CUBEv},
+	};
+
+	struct ir2_opc op = nir_ir2_opc[opcode];
+	/* opcodes without any table entry are not supported */
+	assert(op.vector >= 0 || op.scalar >= 0);
+
+	struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU);
+	instr->alu.vector_opc = op.vector;
+	instr->alu.scalar_opc = op.scalar;
+	instr->alu.export = -1;
+	instr->alu.write_mask = (1 << ncomp) - 1;
+	/* ir2_op_cube has no nir_op_infos entry, so its source count is fixed */
+	instr->src_count = opcode == ir2_op_cube ? 2 :
+		nir_op_infos[opcode].num_inputs;
+	instr->ssa.ncomp = ncomp;
+	return instr;
+}
+
+/* Create an ALU instruction that writes the components in `write_mask` of
+ * a register instead of an SSA value.
+ *
+ * When `share_reg` is non-NULL the new instruction writes the register of
+ * that instruction; otherwise the next entry of ctx->reg is taken. The
+ * register's component count is grown to cover the highest bit set in
+ * `write_mask`.
+ */
+static struct ir2_instr *
+instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode,
+		uint8_t write_mask, struct ir2_instr *share_reg)
+{
+	struct ir2_instr *instr;
+	struct ir2_reg *reg;
+
+	reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++];
+	reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1);
+
+	/* the instruction itself produces one value per written component */
+	instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask));
+	instr->alu.write_mask = write_mask;
+	instr->reg = reg;
+	instr->is_ssa = false;
+	return instr;
+}
+
+
+/* Create an ALU instruction sized after the NIR destination `dst` and bind
+ * its result to that destination.
+ */
+static struct ir2_instr *
+instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst)
+{
+	struct ir2_instr *alu =
+		instr_create_alu(ctx, opcode, nir_dest_num_components(*dst));
+
+	set_index(ctx, dst, alu);
+	return alu;
+}
+
+/* Create a fetch instruction with opcode `opc` and a single source slot,
+ * with its result bound to the NIR destination `dst`.
+ */
+static struct ir2_instr *
+ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst,
+		instr_fetch_opc_t opc)
+{
+	struct ir2_instr *fetch = ir2_instr_create(ctx, IR2_FETCH);
+
+	fetch->ssa.ncomp = nir_dest_num_components(*dst);
+	fetch->src_count = 1;
+	fetch->fetch.opc = opc;
+	set_index(ctx, dst, fetch);
+	return fetch;
+}
+
+/* Like make_src(), but never returns a constant source: a constant is first
+ * copied through an fmov and the result of that instruction is returned.
+ */
+static struct ir2_src
+make_src_noconst(struct ir2_context *ctx, nir_src src)
+{
+	if (!nir_src_as_const_value(src))
+		return make_src(ctx, src);
+
+	assert(src.is_ssa);
+
+	struct ir2_instr *mov =
+		instr_create_alu(ctx, nir_op_fmov, src.ssa->num_components);
+	mov->src[0] = make_src(ctx, src);
+	return ir2_src(mov->idx, 0, IR2_SRC_SSA);
+}
+
+/* Translate one NIR ALU instruction into an ir2 ALU instruction.
+ *
+ * The destination, saturate flag and write mask are taken from the NIR
+ * instruction; each source gets its swizzle (compressed by the write mask
+ * where applicable), negate and abs modifiers. Ops that do not map directly
+ * to an a2xx opcode are fixed up at the end.
+ */
+static void
+emit_alu(struct ir2_context *ctx, nir_alu_instr * alu)
+{
+	const nir_op_info *info = &nir_op_infos[alu->op];
+	nir_dest *dst = &alu->dest.dest;
+	struct ir2_instr *instr;
+	struct ir2_src tmp;
+	unsigned ncomp;
+
+	/* get the number of dst components */
+	if (dst->is_ssa) {
+		ncomp = dst->ssa.num_components;
+	} else {
+		ncomp = 0;
+		for (int i = 0; i < 4; i++)
+			ncomp += !!(alu->dest.write_mask & 1 << i);
+	}
+
+	instr = instr_create_alu(ctx, alu->op, ncomp);
+	set_index(ctx, dst, instr);
+	instr->alu.saturate = alu->dest.saturate;
+	instr->alu.write_mask = alu->dest.write_mask;
+
+	for (int i = 0; i < info->num_inputs; i++) {
+		nir_alu_src *src = &alu->src[i];
+
+		/* compress swizzle with writemask when applicable */
+		unsigned swiz = 0, j = 0;
+		for (int c = 0; c < 4; c++) {
+			if (!(alu->dest.write_mask & 1 << c) && !info->output_size)
+				continue;
+			swiz |= swiz_set(src->swizzle[c], j++);
+		}
+
+		instr->src[i] = make_src(ctx, src->src);
+		instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz);
+		instr->src[i].negate = src->negate;
+		instr->src[i].abs = src->abs;
+	}
+
+	/* workarounds for NIR ops that don't map directly to a2xx ops */
+	switch (alu->op) {
+	case nir_op_slt:
+		/* slt uses the "greater than" opcode with swapped operands */
+		tmp = instr->src[0];
+		instr->src[0] = instr->src[1];
+		instr->src[1] = tmp;
+		break;
+	case nir_op_fcsel:
+	case nir_op_bcsel:
+		/* the select opcode takes the two value operands in the other order */
+		tmp = instr->src[1];
+		instr->src[1] = instr->src[2];
+		instr->src[2] = tmp;
+		break;
+	case nir_op_fsub:
+		/* a - b is emitted as a + (-b) */
+		instr->src[1].negate = !instr->src[1].negate;
+		break;
+	case nir_op_fdot2:
+		/* DOT2ADD takes a third (addend) source: use zero */
+		instr->src_count = 3;
+		instr->src[2] = ir2_zero(ctx);
+		break;
+	case nir_op_fsign: {
+		/* we need an extra instruction to deal with the zero case */
+		struct ir2_instr *sel;
+
+		/* sel = x == 0 ? 0 : 1 */
+		sel = instr_create_alu(ctx, nir_op_fcsel, ncomp);
+		sel->src[0] = instr->src[0];
+		sel->src[1] = ir2_zero(ctx);
+		sel->src[2] = load_const(ctx, (float[]) {1.0f}, 1);
+		/* Broadcast the component load_const() picked for 1.0. The merge
+		 * must start from the swizzle load_const() returned, otherwise the
+		 * wrong immediate component is read whenever 1.0 was merged into
+		 * an existing immediate at a component other than x.
+		 */
+		sel->src[2].swizzle = swiz_merge(sel->src[2].swizzle, IR2_SWIZZLE_XXXX);
+
+		/* result = x >= 0 ? sel : -sel */
+		instr->src[1] = ir2_src(sel->idx, 0, IR2_SRC_SSA);
+		instr->src[2] = instr->src[1];
+		instr->src[2].negate = true;
+		instr->src_count = 3;
+	} break;
+	default:
+		break;
+	}
+}
+
+/* Find the input variable whose driver_location equals the intrinsic's base
+ * and return its location (slot). Asserts that such a variable exists.
+ */
+static unsigned
+input_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
+{
+	const unsigned base = nir_intrinsic_base(intr);
+	int slot = -1;
+
+	nir_foreach_variable(var, &ctx->nir->inputs) {
+		if (var->data.driver_location != base)
+			continue;
+		slot = var->data.location;
+		break;
+	}
+	assert(slot != -1);
+	return slot;
+}
+
+/* Find the output variable whose driver_location equals the intrinsic's
+ * base and return its location (slot). Asserts that such a variable exists.
+ */
+static unsigned
+output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr)
+{
+	const unsigned base = nir_intrinsic_base(intr);
+	int slot = -1;
+
+	nir_foreach_variable(var, &ctx->nir->outputs) {
+		if (var->data.driver_location != base)
+			continue;
+		slot = var->data.location;
+		break;
+	}
+	assert(slot != -1);
+	return slot;
+}
+
+/* Return the index in so->f.inputs of the entry with the given slot, or
+ * ~0u when no entry has that slot.
+ */
+static unsigned
+vertex_output_link(struct ir2_context *ctx, unsigned slot)
+{
+	for (unsigned n = 0; n < ctx->so->f.inputs_count; n++) {
+		if (ctx->so->f.inputs[n].slot == slot)
+			return n;
+	}
+	return ~0u;
+}
+
+/* Emit the instruction(s) that load the input at `slot` into `dst`.
+ *
+ * Vertex shaders fetch generic attributes with a fetch instruction.
+ * Fragment shaders special-case gl_PointCoord and gl_FragCoord, which read
+ * from the input after the regular ones (index so->f.inputs_count), and
+ * otherwise move the input that setup_input() recorded for the slot.
+ */
+static void
+load_input(struct ir2_context *ctx, nir_dest *dst, unsigned slot)
+{
+	struct ir2_instr *instr;
+
+	if (ctx->so->type == SHADER_VERTEX) {
+		assert(slot >= VERT_ATTRIB_GENERIC0 && slot <= VERT_ATTRIB_GENERIC15);
+		slot -= VERT_ATTRIB_GENERIC0;
+		instr = ir2_instr_create_fetch(ctx, dst, 0);
+		instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT);
+		/* three attributes per fetch constant, starting at constant 20
+		 * NOTE(review): presumably matches the vertex fetch constant
+		 * layout programmed by the driver -- confirm
+		 */
+		instr->fetch.vtx.const_idx = 20 + (slot / 3);
+		instr->fetch.vtx.const_idx_sel = slot % 3;
+		return;
+	}
+
+	if (slot == VARYING_SLOT_PNTC) {
+		/* need to invert y */
+		/* dst = |param.zw| * (1, -1) + (0, 1) */
+		instr = instr_create_alu_dest(ctx, nir_op_ffma, dst);
+		instr->src[0] = ir2_src(ctx->so->f.inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT);
+		instr->src[0].abs = true;
+		instr->src[1] = load_const(ctx, (float[]) {1.0f, -1.0f}, 2);
+		instr->src[2] = load_const(ctx, (float[]) {0.0f, 1.0f}, 2);
+	} else if (slot == VARYING_SLOT_POS) {
+		/* TODO: only components that are required by fragment shader */
+		/* xy (write mask 3) = |param| * const64 + const64.zw, in a new
+		 * register that the next two instructions share
+		 */
+		instr = instr_create_alu_reg(ctx, nir_op_ffma, 3, NULL);
+		instr->src[0] = ir2_src(ctx->so->f.inputs_count, 0, IR2_SRC_INPUT);
+		instr->src[0].abs = true;
+		instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST);
+		instr->src[2] = ir2_src(64, IR2_SWIZZLE_ZW, IR2_SRC_CONST);
+
+		/* z (write mask 4) is moved from the fragcoord varying */
+		instr = instr_create_alu_reg(ctx, nir_op_fmov, 4, instr);
+		instr->src[0] = ir2_src(ctx->so->f.fragcoord, 0, IR2_SRC_INPUT);
+
+		/* w (write mask 8) is the reciprocal of a fragcoord varying
+		 * component
+		 */
+		instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr);
+		instr->src[0] = ir2_src(ctx->so->f.fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT);
+
+		/* index of the shared register inside ctx->reg */
+		unsigned reg_idx = instr->reg - ctx->reg; /* XXX */
+		instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
+		instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
+	} else {
+		/* find input that was assigned to this slot */
+		unsigned idx;
+		for (idx = 0; idx < ctx->so->f.inputs_count; idx++)
+			if (ctx->so->f.inputs[idx].slot == slot)
+				break;
+
+		assert(idx < ctx->so->f.inputs_count);
+		instr = instr_create_alu_dest(ctx, nir_op_fmov, dst);
+		instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT);
+	}
+}
+
+/* Emit an export of `src` for the output at `slot`.
+ *
+ * Vertex shaders export position as 62 (and remember the source in
+ * ctx->position), point size as 63, and other outputs at the index of the
+ * matching fragment shader input. Fragment shaders export color/data0 as 0.
+ * Outputs without an export index are dropped.
+ */
+static void
+store_output(struct ir2_context *ctx, nir_src src, unsigned slot, unsigned ncomp)
+{
+	struct ir2_instr *mov;
+	unsigned export_idx;
+
+	if (ctx->so->type == SHADER_VERTEX) {
+		switch (slot) {
+		case VARYING_SLOT_POS:
+			ctx->position = make_src(ctx, src);
+			export_idx = 62;
+			break;
+		case VARYING_SLOT_PSIZ:
+			export_idx = 63;
+			ctx->so->writes_psize = true;
+			break;
+		default:
+			export_idx = vertex_output_link(ctx, slot);
+			break;
+		}
+	} else if (slot == FRAG_RESULT_COLOR || slot == FRAG_RESULT_DATA0) {
+		export_idx = 0;
+	} else {
+		export_idx = ~0u;
+	}
+
+	/* nothing consumes this output */
+	if (export_idx == ~0u)
+		return;
+
+	mov = instr_create_alu(ctx, nir_op_fmov, ncomp);
+	mov->src[0] = make_src(ctx, src);
+	mov->alu.export = export_idx;
+}
+
+/* Translate one NIR intrinsic into ir2 instructions.
+ *
+ * Handles input loads, output stores (both the lowered and the deref
+ * forms), uniform loads with a constant offset, discard/discard_if and
+ * gl_FrontFacing. Anything else is reported through compile_error().
+ */
+static void
+emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr)
+{
+	struct ir2_instr *instr;
+	nir_const_value *const_offset;
+	nir_deref_instr *deref;
+	unsigned idx;
+
+	switch (intr->intrinsic) {
+	case nir_intrinsic_load_input:
+		load_input(ctx, &intr->dest, input_slot(ctx, intr));
+		break;
+	case nir_intrinsic_store_output:
+		store_output(ctx, intr->src[0], output_slot(ctx, intr), intr->num_components);
+		break;
+	case nir_intrinsic_load_deref:
+		deref = nir_src_as_deref(intr->src[0]);
+		assert(deref->deref_type == nir_deref_type_var);
+		load_input(ctx, &intr->dest, deref->var->data.location);
+		break;
+	case nir_intrinsic_store_deref:
+		deref = nir_src_as_deref(intr->src[0]);
+		assert(deref->deref_type == nir_deref_type_var);
+		store_output(ctx, intr->src[1], deref->var->data.location, intr->num_components);
+		break;
+	case nir_intrinsic_load_uniform:
+		const_offset = nir_src_as_const_value(intr->src[0]);
+		assert(const_offset); /* TODO can be false in ES2? */
+		idx = nir_intrinsic_base(intr);
+		/* the offset constant is read as a float, as the original code does */
+		idx += (uint32_t) const_offset->f32[0];
+		instr = instr_create_alu_dest(ctx, nir_op_fmov, &intr->dest);
+		instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST);
+		break;
+	case nir_intrinsic_discard:
+	case nir_intrinsic_discard_if:
+		instr = ir2_instr_create(ctx, IR2_ALU);
+		instr->alu.vector_opc = VECTOR_NONE;
+		if (intr->intrinsic == nir_intrinsic_discard_if) {
+			/* kill when the condition is non-zero */
+			instr->alu.scalar_opc = KILLNEs;
+			instr->src[0] = make_src(ctx, intr->src[0]);
+		} else {
+			/* unconditional: "kill if equal" on a zero source */
+			instr->alu.scalar_opc = KILLEs;
+			instr->src[0] = ir2_zero(ctx);
+		}
+		instr->alu.export = -1;
+		instr->src_count = 1;
+		break;
+	case nir_intrinsic_load_front_face: {
+		/* gl_FrontFacing is in the sign of param.x
+		 * rcp required because otherwise we can't differentiate -0.0 and +0.0
+		 */
+		ctx->so->need_param = true;
+
+		struct ir2_instr *rcp = instr_create_alu(ctx, nir_op_frcp, 1);
+		rcp->src[0] = ir2_src(ctx->so->f.inputs_count, 0, IR2_SRC_INPUT);
+
+		/* nir_op_sge, not nir_op_fge: the opcode table in
+		 * instr_create_alu() has no entry for fge, while sge maps to the
+		 * SETGTE opcode needed here
+		 */
+		instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest);
+		instr->src[0] = ir2_src(rcp->idx, 0, IR2_SRC_SSA);
+		instr->src[1] = ir2_zero(ctx);
+	} break;
+	default:
+		compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic);
+		break;
+	}
+}
+
+/* Translate a NIR texture instruction into an ir2 texture fetch.
+ *
+ * Supports tex/txb/txl on 2D, RECT and cube samplers with a coordinate
+ * source and at most one lod or bias source; anything else is reported
+ * through compile_error(). Cube map coordinates are preprocessed with extra
+ * ALU instructions before the fetch.
+ */
+static void
+emit_tex(struct ir2_context *ctx, nir_tex_instr * tex)
+{
+	bool is_rect = false, is_cube = false;
+	struct ir2_instr *instr;
+	nir_src *coord, *lod_bias;
+
+	coord = lod_bias = NULL;
+
+	/* collect the sources we know how to handle */
+	for (unsigned i = 0; i < tex->num_srcs; i++) {
+		switch (tex->src[i].src_type) {
+		case nir_tex_src_coord:
+			coord = &tex->src[i].src;
+			break;
+		case nir_tex_src_bias:
+		case nir_tex_src_lod:
+			/* bias and lod share one slot, only one may be present */
+			assert(!lod_bias);
+			lod_bias = &tex->src[i].src;
+			break;
+		default:
+			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
+						  tex->src[i].src_type);
+			return;
+		}
+	}
+
+	switch (tex->op) {
+	case nir_texop_tex:
+	case nir_texop_txb:
+	case nir_texop_txl:
+		break;
+	default:
+		compile_error(ctx, "unimplemented texop %d\n", tex->op);
+		return;
+	}
+
+	switch (tex->sampler_dim) {
+	case GLSL_SAMPLER_DIM_2D:
+		break;
+	case GLSL_SAMPLER_DIM_RECT:
+		is_rect = true;
+		break;
+	case GLSL_SAMPLER_DIM_CUBE:
+		is_cube = true;
+		break;
+	default:
+		compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim);
+		return;
+	}
+
+	/* NOTE(review): coord is dereferenced unconditionally; this assumes
+	 * every supported texop carries a coordinate source -- confirm
+	 */
+	struct ir2_src src_coord = make_src_noconst(ctx, *coord);
+
+	/* for cube maps
+	 * tmp = cube(coord)
+	 * tmp.xy = tmp.xy / |tmp.z| + 1.5
+	 * coord = tmp.xyw
+	 */
+	if (is_cube) {
+		struct ir2_instr *rcp, *coord_xy;
+		unsigned reg_idx;
+
+		/* write all four components (mask 15) of a new register */
+		instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL);
+		instr->src[0] = src_coord;
+		instr->src[0].swizzle = IR2_SWIZZLE_ZZXY;
+		instr->src[1] = src_coord;
+		instr->src[1].swizzle = IR2_SWIZZLE_YXZZ;
+
+		/* index of that register inside ctx->reg */
+		reg_idx = instr->reg - ctx->reg; /* hacky */
+
+		rcp = instr_create_alu(ctx, nir_op_frcp, 1);
+		rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG);
+		rcp->src[0].abs = true;
+
+		/* overwrite xy (mask 3) of the same register */
+		coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr);
+		coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG);
+		coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+		coord_xy->src[2] = load_const(ctx, (float[]) {1.5f, 1.5f}, 2);
+
+		src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG);
+		/* TODO: lod/bias transformed by src_coord.z ? */
+	}
+
+	instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH);
+	instr->src[0] = src_coord;
+	instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_XYW : 0;
+	instr->fetch.tex.is_cube = is_cube;
+	instr->fetch.tex.is_rect = is_rect;
+	instr->fetch.tex.samp_id = tex->sampler_index;
+
+	/* for lod/bias, we insert an extra src for the backend to deal with */
+	if (lod_bias) {
+		instr->src[1] = make_src_noconst(ctx, *lod_bias);
+		/* backend will use 2-3 components so apply swizzle */
+		swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX);
+		instr->src_count = 2;
+	}
+}
+
+/* Record a fragment shader input variable in so->f.inputs.
+ *
+ * Vertex shader inputs are handled later and skipped here. gl_PointCoord
+ * only marks the param input as needed; gl_FragCoord takes a regular input
+ * entry, remembers its index in so->f.fragcoord and also needs the param
+ * input.
+ */
+static void
+setup_input(struct ir2_context *ctx, nir_variable * in)
+{
+	struct fd2_shader_stateobj *so = ctx->so;
+	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
+	const unsigned slot = in->data.location;
+	unsigned entry;
+
+	assert(array_len == 1);
+
+	/* handle later */
+	if (so->type == SHADER_VERTEX)
+		return;
+
+	if (so->type != SHADER_FRAGMENT)
+		compile_error(ctx, "unknown shader type: %d\n", ctx->so->type);
+
+	if (slot == VARYING_SLOT_PNTC) {
+		so->need_param = true;
+		return;
+	}
+
+	entry = so->f.inputs_count++;
+
+	/* half of fragcoord from param reg, half from a varying */
+	if (slot == VARYING_SLOT_POS) {
+		so->need_param = true;
+		so->f.fragcoord = entry;
+	}
+
+	so->f.inputs[entry].ncomp = glsl_get_components(in->type);
+	so->f.inputs[entry].slot = slot;
+
+	/* in->data.interpolation?
+	 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD
+	 */
+}
+
+/* Give an SSA undef a defined producer: an fmov from constant 0 is emitted
+ * and bound to the undef's SSA value.
+ */
+static void
+emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr * undef)
+{
+	/* TODO we don't want to emit anything for undefs */
+
+	struct ir2_instr *instr;
+
+	/* wrap the SSA def in a temporary nir_dest so the helper can be used */
+	instr = instr_create_alu_dest(ctx, nir_op_fmov,
+		&(nir_dest) {.ssa = undef->def,.is_ssa = true});
+	instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST);
+}
+
+/* Dispatch one NIR instruction to the matching emit_* function.
+ * Instruction types without a case here are silently ignored.
+ */
+static void
+emit_instr(struct ir2_context *ctx, nir_instr * instr)
+{
+	switch (instr->type) {
+	case nir_instr_type_alu:
+		emit_alu(ctx, nir_instr_as_alu(instr));
+		break;
+	case nir_instr_type_deref:
+		/* ignored, handled as part of the intrinsic they are src to */
+		break;
+	case nir_instr_type_intrinsic:
+		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+		break;
+	case nir_instr_type_load_const:
+		/* dealt with when using nir_src */
+		break;
+	case nir_instr_type_tex:
+		emit_tex(ctx, nir_instr_as_tex(instr));
+		break;
+	case nir_instr_type_jump:
+		/* only remembered here; emit_block() emits the actual jump */
+		ctx->block_has_jump[ctx->block_idx] = true;
+		break;
+	case nir_instr_type_ssa_undef:
+		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
+		break;
+	default:
+		break;
+	}
+}
+
+/* fragcoord.zw and a20x hw binning outputs
+ *
+ * Emits extra vertex shader exports derived from ctx->position. Nothing is
+ * emitted when the fragment shader has no fragcoord input and this is not
+ * a variant. For the non-variant shader only the fragcoord z/w exports are
+ * added; for a variant only the per-bin exports (32 and 33) are added.
+ *
+ * NOTE(review): the constant register numbers used below (1, 3.., 64..)
+ * are presumably set up by the driver's emit code -- confirm.
+ */
+static void
+extra_position_exports(struct ir2_context *ctx, unsigned variant)
+{
+	struct ir2_instr *instr, *rcp, *sc, *wincoord, *off;
+
+	if (ctx->so->f.fragcoord < 0 && !variant)
+		return;
+
+	/* max(position.w, 0) */
+	instr = instr_create_alu(ctx, nir_op_fmax, 1);
+	instr->src[0] = ctx->position;
+	instr->src[0].swizzle = IR2_SWIZZLE_W;
+	instr->src[1] = ir2_zero(ctx);
+
+	/* 1 / max(position.w, 0) */
+	rcp = instr_create_alu(ctx, nir_op_frcp, 1);
+	rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA);
+
+	/* sc = position * rcp */
+	sc = instr_create_alu(ctx, nir_op_fmul, 4);
+	sc->src[0] = ctx->position;
+	sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+
+	/* wincoord = const66 * sc + const65 */
+	wincoord = instr_create_alu(ctx, nir_op_ffma, 4);
+	wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST);
+	wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA);
+	wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST);
+
+	/* fragcoord z/w */
+	if (ctx->so->f.fragcoord >= 0 && !variant) {
+		/* first component of the export: wincoord.z */
+		instr = instr_create_alu(ctx, nir_op_fmov, 1);
+		instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA);
+		instr->alu.export = ctx->so->f.fragcoord;
+
+		/* second component (write mask 2): position.w */
+		instr = instr_create_alu(ctx, nir_op_fmov, 1);
+		instr->src[0] = ctx->position;
+		instr->src[0].swizzle = IR2_SWIZZLE_W;
+		instr->alu.export = ctx->so->f.fragcoord;
+		instr->alu.write_mask = 2;
+	}
+
+	if (!variant)
+		return;
+
+	/* off = const64 + input 2 */
+	off = instr_create_alu(ctx, nir_op_fadd, 1);
+	off->src[0] = ir2_src(64, 0, IR2_SRC_CONST);
+	off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT);
+
+	/* 8 max set in freedreno_screen.. unneeded instrs patched out */
+	for (int i = 0; i < 8; i++) {
+		instr = instr_create_alu(ctx, nir_op_ffma, 4);
+		instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST);
+		instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA);
+		instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST);
+		instr->alu.export = 32;
+
+		instr = instr_create_alu(ctx, nir_op_ffma, 4);
+		instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST);
+		instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA);
+		instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST);
+		instr->alu.export = 33;
+	}
+}
+
+static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list);
+
+/* Emit all instructions of a NIR block, followed by a jump to the block's
+ * first successor when one is needed.
+ *
+ * Returns true when a jump (IR2_CF) instruction was emitted.
+ */
+static bool
+emit_block(struct ir2_context *ctx, nir_block * block)
+{
+	nir_block *next = block->successors[0];
+	struct ir2_instr *jump;
+
+	ctx->block_idx = block->index;
+
+	nir_foreach_instr(instr, block)
+		emit_instr(ctx, instr);
+
+	/* no successor, or a successor with index 0: nothing to jump to */
+	if (!next || !next->index)
+		return false;
+
+	/* we want to be smart and always jump and have the backend cleanup
+	 * but we are not, so there are two cases where jump is needed:
+	 *  loops (succs index lower)
+	 *  jumps (jump instruction seen in block)
+	 */
+	if (next->index > block->index && !ctx->block_has_jump[block->index])
+		return false;
+
+	assert(block->successors[1] == NULL);
+
+	jump = ir2_instr_create(ctx, IR2_CF);
+	jump->cf.block_idx = next->index;
+	/* XXX we can't jump to a predicated block */
+	return true;
+}
+
+/* Emit a NIR if: a predicate-setting instruction followed by the "then"
+ * and "else" lists, each emitted with ctx->pred set accordingly.
+ *
+ * The predicate state on entry (ctx->pred) is restored on exit; note that
+ * ctx->pred_idx is left pointing at the last predicate instruction emitted
+ * here.
+ */
+static void
+emit_if(struct ir2_context *ctx, nir_if * nif)
+{
+	struct ir2_instr *instr;
+	bool jumps;
+	/* outer predicate state, needed for nesting and for restoring */
+	unsigned pred = ctx->pred, pred_idx = ctx->pred_idx;
+
+	/* XXX nested predicates are not fully implemented */
+	assert(!pred || pred == 3);
+
+	/* instruction that sets the predicate from the if condition; it is
+	 * itself not predicated (pred = 0)
+	 */
+	instr = ir2_instr_create(ctx, IR2_ALU);
+	instr->src[0] = make_src(ctx, nif->condition);
+	instr->src_count = 1;
+	instr->ssa.ncomp = 1;
+	instr->alu.vector_opc = VECTOR_NONE;
+	instr->alu.scalar_opc = SCALAR_NONE;
+	instr->alu.export = -1;
+	instr->alu.write_mask = 1;
+	instr->pred = 0;
+
+	if (pred) {
+		/* XXX we want this instr in its own block */
+		/* nested: push, combining the outer predicate instruction's
+		 * result (src 0) with the condition (src 1)
+		 */
+		instr->alu.vector_opc = PRED_SETNE_PUSHv;
+		instr->src[1] = instr->src[0];
+		instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA);
+		instr->src[0].swizzle = IR2_SWIZZLE_XXXX;
+		instr->src[1].swizzle = IR2_SWIZZLE_XXXX;
+		instr->src_count = 2;
+	} else {
+		instr->alu.scalar_opc = PRED_SETNEs;
+	}
+
+	/* the "then" list is emitted with pred == 3 */
+	ctx->pred_idx = instr->idx;
+	ctx->pred = 3;
+
+	jumps = emit_cf_list(ctx, &nif->then_list);
+
+	/* nested predicate must jump away */
+	assert(!pred || jumps);
+
+	/* nested case: pop the predicate pushed above
+	 * (an "else" predicate is only needed when "then" doesn't jump away,
+	 * see the ctx->pred assignment below)
+	 */
+	if (pred) {
+		instr = ir2_instr_create(ctx, IR2_ALU);
+		instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA);
+		instr->src_count = 1;
+		instr->ssa.ncomp = 1;
+		instr->alu.vector_opc = VECTOR_NONE;
+		instr->alu.scalar_opc = PRED_SET_POPs;
+		instr->alu.export = -1;
+		instr->alu.write_mask = 1;
+		instr->pred = 0;
+		/* place the pop in the block after the current one */
+		instr->block_idx++;
+		ctx->pred_idx = instr->idx;
+	}
+
+	/* "else" list: pred == 2 when "then" fell through, otherwise the
+	 * outer predicate is enough
+	 */
+	ctx->pred = !jumps ? 2 : pred;
+	/* NOTE(review): the value assigned to jumps here is not used */
+	jumps = emit_cf_list(ctx, &nif->else_list);
+	ctx->pred = pred;
+}
+
+/* get the highest block idx in the loop, so we know when
+ * we can free registers that are allocated outside the loop
+ *
+ * Looks only at the last node of the control flow list: a block yields its
+ * index, a loop is searched recursively through its body.
+ */
+static unsigned
+loop_last_block(struct exec_list *list)
+{
+	nir_cf_node *node =
+		exec_node_data(nir_cf_node, exec_list_get_tail(list), node);
+	switch (node->type) {
+	case nir_cf_node_block:
+		return nir_cf_node_as_block(node)->index;
+	case nir_cf_node_if:
+		assert(0); /* XXX could this ever happen? */
+		return 0;
+	case nir_cf_node_loop:
+		return loop_last_block(&nir_cf_node_as_loop(node)->body);
+	default:
+		/* NOTE(review): `ctx` is neither a parameter nor a local of this
+		 * function; this only builds if compile_error() does not expand
+		 * its first argument -- confirm
+		 */
+		compile_error(ctx, "Not supported\n");
+		return 0;
+	}
+}
+
+/* Emit the body of a NIR loop one loop level deeper, after recording the
+ * index of the loop's last block for that level.
+ */
+static void
+emit_loop(struct ir2_context *ctx, nir_loop *nloop)
+{
+	ctx->loop_depth++;
+	ctx->loop_last_block[ctx->loop_depth] = loop_last_block(&nloop->body);
+
+	emit_cf_list(ctx, &nloop->body);
+
+	ctx->loop_depth--;
+}
+
+/* Emit every node of a NIR control flow list in order.
+ *
+ * Returns true only when the last node of the list is a block for which
+ * emit_block() emitted a jump; an empty list, or a list ending in any other
+ * node type, yields false.
+ */
+static bool
+emit_cf_list(struct ir2_context *ctx, struct exec_list *list)
+{
+	bool ret = false;
+	foreach_list_typed(nir_cf_node, node, node, list) {
+		/* only the result for the last node is kept */
+		ret = false;
+		switch (node->type) {
+		case nir_cf_node_block:
+			ret = emit_block(ctx, nir_cf_node_as_block(node));
+			break;
+		case nir_cf_node_if:
+			emit_if(ctx, nir_cf_node_as_if(node));
+			break;
+		case nir_cf_node_loop:
+			emit_loop(ctx, nir_cf_node_as_loop(node));
+			break;
+		case nir_cf_node_function:
+			compile_error(ctx, "Not supported\n");
+			break;
+		}
+	}
+	return ret;
+}
+
+/* For a shader variant (variant != 0, vertex shaders only): remove every
+ * output store except the position store, then re-run the optimizer so the
+ * code feeding the removed stores can be dropped.
+ */
+static void
+variant_opt(struct ir2_context *ctx, unsigned variant)
+{
+	if (!variant)
+		return;
+
+	assert(ctx->so->type == SHADER_VERTEX);
+
+	/* kill non-position outputs for binning variant */
+	nir_foreach_block(block, nir_shader_get_entrypoint(ctx->nir)) {
+		nir_foreach_instr_safe(instr, block) {
+			nir_intrinsic_instr *intr;
+			unsigned slot;
+
+			if (instr->type != nir_instr_type_intrinsic)
+				continue;
+
+			intr = nir_instr_as_intrinsic(instr);
+			if (intr->intrinsic == nir_intrinsic_store_deref) {
+				nir_deref_instr *deref = nir_src_as_deref(intr->src[0]);
+				assert(deref->deref_type == nir_deref_type_var);
+				slot = deref->var->data.location;
+			} else if (intr->intrinsic == nir_intrinsic_store_output) {
+				slot = output_slot(ctx, intr);
+			} else {
+				/* not a store: keep */
+				continue;
+			}
+
+			if (slot == VARYING_SLOT_POS)
+				continue;
+
+			nir_instr_remove(instr);
+		}
+	}
+
+	ir2_optimize_nir(ctx->nir, false);
+}
+
+/* Compile the NIR shader in ctx->so into ir2 instructions stored in ctx.
+ *
+ * Works on a clone of so->nir, which is freed before returning. The clone
+ * is post-processed (scalar lowering, source modifiers, conversion out of
+ * SSA, vec lowering), the inputs are set up, the function body is emitted
+ * and, for vertex shaders, the extra position exports are appended.
+ */
+void
+ir2_nir_compile(struct ir2_context *ctx, unsigned variant)
+{
+	struct fd2_shader_stateobj *so = ctx->so;
+
+	/* every ssa_map byte becomes 0xff, so entries read back as negative
+	 * until set_index() fills them in (see the assert in make_src())
+	 */
+	memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map));
+
+	ctx->nir = nir_shader_clone(NULL, so->nir);
+
+	variant_opt(ctx, variant);
+
+	/* postprocess */
+	OPT_V(ctx->nir, nir_opt_algebraic_late);
+
+	/* lower to scalar instructions that can only be scalar on a2xx */
+	OPT_V(ctx->nir, ir2_nir_lower_scalar);
+
+	OPT_V(ctx->nir, nir_lower_to_source_mods);
+	OPT_V(ctx->nir, nir_copy_prop);
+	OPT_V(ctx->nir, nir_opt_dce);
+	OPT_V(ctx->nir, nir_opt_move_comparisons);
+
+	OPT_V(ctx->nir, nir_lower_locals_to_regs);
+
+	OPT_V(ctx->nir, nir_convert_from_ssa, true);
+
+	OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest);
+	OPT_V(ctx->nir, nir_lower_vec_to_movs);
+
+	OPT_V(ctx->nir, nir_opt_dce);
+
+	nir_sweep(ctx->nir);
+
+	if (fd_mesa_debug & FD_DBG_DISASM) {
+		debug_printf("----------------------\n");
+		nir_print_shader(ctx->nir, stdout);
+		debug_printf("----------------------\n");
+	}
+
+	/* fd2_shader_stateobj init */
+	if (so->type == SHADER_FRAGMENT) {
+		so->f.fragcoord = -1;
+		so->f.inputs_count = 0;
+		memset(so->f.inputs, 0, sizeof(so->f.inputs));
+	}
+
+	/* Setup inputs: */
+	nir_foreach_variable(in, &ctx->nir->inputs)
+		setup_input(ctx, in);
+
+	if (so->type == SHADER_FRAGMENT) {
+		unsigned idx;
+		for (idx = 0; idx < so->f.inputs_count; idx++) {
+			ctx->input[idx].ncomp = so->f.inputs[idx].ncomp;
+			update_range(ctx, &ctx->input[idx]);
+		}
+		/* assume we have param input and kill it later if not */
+		ctx->input[idx].ncomp = 4;
+		update_range(ctx, &ctx->input[idx]);
+	} else {
+		/* vertex shader: inputs 0 and 2 are single-component inputs that
+		 * load_input() and extra_position_exports() read
+		 */
+		ctx->input[0].ncomp = 1;
+		ctx->input[2].ncomp = 1;
+		update_range(ctx, &ctx->input[0]);
+		update_range(ctx, &ctx->input[2]);
+	}
+
+	/* And emit the body: */
+	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir);
+
+	/* mirror the NIR registers into ctx->reg, indexed by register index */
+	nir_foreach_register(reg, &fxn->registers) {
+		ctx->reg[reg->index].ncomp = reg->num_components;
+		ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1);
+	}
+
+	/* block indices are used as block_idx throughout the emit code */
+	nir_metadata_require(fxn, nir_metadata_block_index);
+	emit_cf_list(ctx, &fxn->body);
+	/* TODO emit_block(ctx, fxn->end_block); */
+
+	if (so->type == SHADER_VERTEX)
+		extra_position_exports(ctx, variant);
+
+	ralloc_free(ctx->nir);
+
+	/* kill unused param input */
+	if (so->type == SHADER_FRAGMENT && !so->need_param)
+		ctx->input[so->f.inputs_count].initialized = false;
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_nir_lower_scalar.c b/src/gallium/drivers/freedreno/a2xx/ir2_nir_lower_scalar.c
new file mode 100644
index 0000000000..2b72a86b3e
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_nir_lower_scalar.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+/* some operations can only be scalar on a2xx:
+ *  rsq, rcp, log2, exp2, cos, sin, sqrt
+ * mostly copy-pasted from nir_lower_alu_to_scalar.c
+ */
+
+#include "ir2_private.h"
+#include "compiler/nir/nir_builder.h"
+
+static void
+nir_alu_ssa_dest_init(nir_alu_instr * instr, unsigned num_components,
+					  unsigned bit_size)
+{
+	nir_ssa_dest_init(&instr->instr, &instr->dest.dest, num_components,
+					  bit_size, NULL);
+	instr->dest.write_mask = (1 << num_components) - 1; /* enable every channel */
+}
+
+static void
+lower_reduction(nir_alu_instr * instr, nir_op chan_op, nir_op merge_op,
+				nir_builder * builder)
+{
+	unsigned num_components = nir_op_infos[instr->op].input_sizes[0];
+	/* build one scalar chan_op per input channel and chain them with merge_op */
+	nir_ssa_def *last = NULL;
+	for (unsigned i = 0; i < num_components; i++) {
+		nir_alu_instr *chan =
+			nir_alu_instr_create(builder->shader, chan_op);
+		nir_alu_ssa_dest_init(chan, 1, instr->dest.dest.ssa.bit_size);
+		nir_alu_src_copy(&chan->src[0], &instr->src[0], chan);
+		chan->src[0].swizzle[0] = chan->src[0].swizzle[i]; /* read channel i */
+		if (nir_op_infos[chan_op].num_inputs > 1) {
+			assert(nir_op_infos[chan_op].num_inputs == 2);
+			nir_alu_src_copy(&chan->src[1], &instr->src[1], chan);
+			chan->src[1].swizzle[0] = chan->src[1].swizzle[i]; /* read channel i */
+		}
+		chan->exact = instr->exact;
+
+		nir_builder_instr_insert(builder, &chan->instr);
+
+		if (i == 0) {
+			last = &chan->dest.dest.ssa;
+		} else {
+			last = nir_build_alu(builder, merge_op,
+								 last, &chan->dest.dest.ssa, NULL, NULL);
+		}
+	}
+
+	assert(instr->dest.write_mask == 1); /* the reduction result is one channel */
+	nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(last));
+	nir_instr_remove(&instr->instr);
+}
+
+static bool lower_scalar(nir_alu_instr * instr, nir_builder * b)
+{
+	assert(instr->dest.dest.is_ssa);
+	assert(instr->dest.write_mask != 0);
+	/* returns true if instr was replaced; the builder cursor is put before it */
+	b->cursor = nir_before_instr(&instr->instr);
+	b->exact = instr->exact;
+	/* expands to the 2/3/4-component cases of a reduction opcode */
+#define LOWER_REDUCTION(name, chan, merge) \
+	case name##2: \
+	case name##3: \
+	case name##4: \
+		lower_reduction(instr, chan, merge, b); \
+		return true;
+
+	switch (instr->op) {
+		/* TODO: handle these instead of lowering */
+		LOWER_REDUCTION(nir_op_fall_equal, nir_op_seq, nir_op_fand);
+		LOWER_REDUCTION(nir_op_fany_nequal, nir_op_sne, nir_op_for);
+
+	default:
+		return false;
+	case nir_op_frsq:
+	case nir_op_frcp:
+	case nir_op_flog2:
+	case nir_op_fexp2:
+	case nir_op_fcos:
+	case nir_op_fsin:
+	case nir_op_fsqrt:
+		break;
+	}
+
+	assert(nir_op_infos[instr->op].num_inputs == 1);
+
+	unsigned num_components = instr->dest.dest.ssa.num_components;
+	nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS] = { NULL };
+	unsigned chan;
+	/* an op that is already scalar needs no lowering */
+	if (num_components == 1)
+		return false;
+	/* emit one single-channel copy of the op per destination channel */
+	for (chan = 0; chan < num_components; chan++) {
+		assert(instr->dest.write_mask & (1 << chan));
+
+		nir_alu_instr *lower = nir_alu_instr_create(b->shader, instr->op);
+
+		nir_alu_src_copy(&lower->src[0], &instr->src[0], lower);
+		lower->src[0].swizzle[0] = instr->src[0].swizzle[chan];
+
+		nir_alu_ssa_dest_init(lower, 1, instr->dest.dest.ssa.bit_size);
+		lower->dest.saturate = instr->dest.saturate;
+		comps[chan] = &lower->dest.dest.ssa;
+		lower->exact = instr->exact;
+
+		nir_builder_instr_insert(b, &lower->instr);
+	}
+	/* gather the scalar results into a vector and hand it to the old users */
+	nir_ssa_def *vec = nir_vec(b, comps, num_components);
+
+	nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(vec));
+
+	nir_instr_remove(&instr->instr);
+	return true;
+}
+
+static bool lower_scalar_impl(nir_function_impl * impl)
+{
+	nir_builder builder;
+	nir_builder_init(&builder, impl);
+	bool progress = false;
+
+	nir_foreach_block(block, impl) {
+		nir_foreach_instr_safe(instr, block) {
+			if (instr->type == nir_instr_type_alu)
+				progress = lower_scalar(nir_instr_as_alu(instr), &builder)
+					|| progress;
+		}
+	}
+	/* NOTE: metadata is marked preserved whether or not anything changed */
+	nir_metadata_preserve(impl, nir_metadata_block_index |
+						  nir_metadata_dominance);
+
+	return progress;
+}
+
+bool ir2_nir_lower_scalar(nir_shader * shader)
+{
+	bool progress = false;
+	/* run the lowering on every function that has an impl; true if any changed */
+	nir_foreach_function(function, shader) {
+		if (function->impl)
+			progress = lower_scalar_impl(function->impl) || progress;
+	}
+
+	return progress;
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_private.h b/src/gallium/drivers/freedreno/a2xx/ir2_private.h
new file mode 100644
index 0000000000..404218c918
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_private.h
@@ -0,0 +1,393 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "ir2.h"
+#include "fd2_program.h"
+#include "instr-a2xx.h"
+
+enum ir2_src_type {
+	IR2_SRC_SSA,
+	IR2_SRC_REG,
+	IR2_SRC_INPUT,
+	IR2_SRC_CONST,
+};
+
+struct ir2_src {
+	/* num can mean different things
+	 *   ssa: index of instruction
+	 *   reg: index in ctx->reg array
+	 *   input: index in ctx->input array
+	 *   const: constant index (C0, C1, etc)
+	 */
+	uint16_t num;
+	uint8_t swizzle;
+	enum ir2_src_type type : 2;
+	uint8_t abs : 1;
+	uint8_t negate : 1;
+	uint8_t : 4;
+};
+
+struct ir2_reg_component {
+	uint8_t c : 3; /* assigned x/y/z/w (7 = don't write, for fetch instr) */
+	bool alloc : 1; /* is it currently allocated */
+	uint8_t ref_count; /* for ra: bumped in ra_count_refs, dropped in ra_src_free */
+};
+
+struct ir2_reg {
+	uint8_t idx; /* assigned hardware register */
+	uint8_t ncomp;
+
+	uint8_t loop_depth;
+	bool initialized;
+	/* block_idx to free on (-1 = free on ref_count==0) */
+	int block_idx_free;
+	struct ir2_reg_component comp[4];
+};
+
+struct ir2_instr {
+	unsigned idx;
+
+	unsigned block_idx;
+
+	enum {
+		IR2_NONE,
+		IR2_FETCH,
+		IR2_ALU,
+		IR2_CF,
+	} type : 2;
+
+	/* instruction needs to be emitted (for scheduling) */
+	bool need_emit : 1;
+
+	/* predicate value - (usually) same for entire block */
+	uint8_t pred : 2;
+
+	/* src */
+	uint8_t src_count;
+	struct ir2_src src[4];
+
+	/* dst */
+	bool is_ssa;
+	union {
+		struct ir2_reg ssa;
+		struct ir2_reg *reg;
+	};
+
+	/* type-specific */
+	union {
+		struct {
+			instr_fetch_opc_t opc : 5;
+			union {
+				struct {
+					uint8_t const_idx;
+					uint8_t const_idx_sel;
+				} vtx;
+				struct {
+					bool is_cube : 1;
+					bool is_rect : 1;
+					uint8_t samp_id;
+				} tex;
+			};
+		} fetch;
+		struct {
+			/* store possible opcs, then we can choose vector/scalar instr */
+			instr_scalar_opc_t scalar_opc : 6;
+			instr_vector_opc_t vector_opc : 5;
+			/* same as nir */
+			uint8_t write_mask : 4;
+			bool saturate : 1;
+
+			/* export idx (-1 no export) */
+			int8_t export;
+
+			/* for scalarized 2 src instruction */
+			uint8_t src1_swizzle;
+		} alu;
+		struct {
+			/* jmp dst block_idx */
+			uint8_t block_idx;
+		} cf;
+	};
+};
+
+struct ir2_sched_instr {
+	uint32_t reg_state[8];
+	struct ir2_instr *instr, *instr_s;
+};
+
+struct ir2_context {
+	struct fd2_shader_stateobj *so;
+
+	unsigned block_idx, pred_idx;
+	uint8_t pred;
+	bool block_has_jump[64];
+
+	unsigned loop_last_block[64];
+	unsigned loop_depth;
+
+	nir_shader *nir;
+
+	/* ssa index of position output */
+	struct ir2_src position;
+
+	/* to translate SSA ids to instruction ids */
+	int16_t ssa_map[1024];
+
+	struct ir2_shader_info *info;
+
+	int prev_export;
+
+	/* RA state */
+	struct ir2_reg* live_regs[64];
+	uint32_t reg_state[256/32]; /* 64*4 bits */
+
+	/* inputs */
+	struct ir2_reg input[16 + 1]; /* 16 + param */
+
+	/* non-ssa regs */
+	struct ir2_reg reg[64];
+	unsigned reg_count;
+
+	struct ir2_instr instr[0x300];
+	unsigned instr_count;
+
+	struct ir2_sched_instr instr_sched[0x180];
+	unsigned instr_sched_count;
+};
+
+void substitutions(struct ir2_context *ctx);
+void late_substitutions(struct ir2_context *ctx);
+
+void assemble(struct ir2_context *ctx);
+
+bool ir2_nir_vectorize(nir_shader * shader);
+bool ir2_nir_lower_scalar(nir_shader * shader);
+void ir2_nir_compile(struct ir2_context *ctx, unsigned variant);
+
+void ra_count_refs(struct ir2_context *ctx);
+void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
+	bool export, uint8_t export_writemask);
+void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr);
+void ra_block_free(struct ir2_context *ctx, unsigned block);
+
+/* utils: a swizzle packs four 2-bit fields, field i = (component - i) & 3, so 0 is identity */
+enum {
+	IR2_SWIZZLE_Y = 1 << 0,
+	IR2_SWIZZLE_Z = 2 << 0,
+	IR2_SWIZZLE_W = 3 << 0,
+
+	IR2_SWIZZLE_ZW = 2 << 0 | 2 << 2,
+
+	IR2_SWIZZLE_XYW = 0 << 0 | 0 << 2 | 1 << 4,
+
+	IR2_SWIZZLE_XXXX = 0 << 0 | 3 << 2 | 2 << 4 | 1 << 6,
+	IR2_SWIZZLE_YYYY = 1 << 0 | 0 << 2 | 3 << 4 | 2 << 6,
+	IR2_SWIZZLE_ZZZZ = 2 << 0 | 1 << 2 | 0 << 4 | 3 << 6,
+	IR2_SWIZZLE_WWWW = 3 << 0 | 2 << 2 | 1 << 4 | 0 << 6,
+	IR2_SWIZZLE_WYWW = 3 << 0 | 0 << 2 | 1 << 4 | 0 << 6,
+	IR2_SWIZZLE_XYXY = 0 << 0 | 0 << 2 | 2 << 4 | 2 << 6,
+	IR2_SWIZZLE_ZZXY = 2 << 0 | 1 << 2 | 2 << 4 | 2 << 6,
+	IR2_SWIZZLE_YXZZ = 1 << 0 | 3 << 2 | 0 << 4 | 3 << 6,
+};
+
+#define compile_error(ctx, args...) ({ \
+	printf(args); \
+	assert(0); \
+})
+
+static inline struct ir2_src
+ir2_src(uint16_t num, uint8_t swizzle, enum ir2_src_type type)
+{
+	return (struct ir2_src) {
+		.num = num,
+		.swizzle = swizzle,
+		.type = type
+	};
+}
+
+/* ir2_assemble uses it .. */
+struct ir2_src ir2_zero(struct ir2_context *ctx);
+
+#define ir2_foreach_instr(it, ctx) /* walk ctx->instr[0..instr_count), skipping IR2_NONE */ \
+	for (struct ir2_instr *it = (ctx)->instr; ({ \
+		while (it != &(ctx)->instr[(ctx)->instr_count] && it->type == IR2_NONE) it++; \
+		 it != &(ctx)->instr[(ctx)->instr_count]; }); it++)
+
+#define ir2_foreach_live_reg(it, ctx) /* visit every non-NULL ctx->live_regs[] slot */ \
+	for (struct ir2_reg **__ptr = (ctx)->live_regs, *it; ({ \
+		while (__ptr != &(ctx)->live_regs[64] && *__ptr == NULL) __ptr++; \
+		 __ptr != &(ctx)->live_regs[64] ? (it=*__ptr) : NULL; }); __ptr++)
+
+#define ir2_foreach_avail(it) /* uses `avail` and `avail_count` from the caller's scope */ \
+	for (struct ir2_instr **__instrp = avail, *it; \
+		__instrp != &avail[avail_count] && ((it = *__instrp), 1); __instrp++)
+
+#define ir2_foreach_src(it, instr) /* walk the first src_count entries of instr->src */ \
+	for (struct ir2_src *it = (instr)->src; \
+		 it != &(instr)->src[(instr)->src_count]; it++)
+
+/* mask for register allocation
+ * 64 registers with 4 components each = 256 bits
+ */
+/* typedef struct {
+	uint64_t data[4];
+} regmask_t; */
+
+static inline bool mask_isset(uint32_t * mask, unsigned num)
+{
+	return !!(mask[num / 32] & 1u << num % 32); /* 1u: int 1 << 31 is undefined */
+}
+
+static inline void mask_set(uint32_t * mask, unsigned num)
+{
+	mask[num / 32] |= 1u << num % 32; /* 1u: int 1 << 31 is undefined */
+}
+
+static inline void mask_unset(uint32_t * mask, unsigned num)
+{
+	mask[num / 32] &= ~(1u << num % 32); /* 1u: int 1 << 31 is undefined */
+}
+
+static inline unsigned mask_reg(uint32_t * mask, unsigned num)
+{
+	return (mask[num / 8] >> ((num % 8) * 4)) & 0xf; /* the 4 component bits of reg num */
+}
+
+static inline bool is_export(struct ir2_instr *instr)
+{
+	return instr->type == IR2_ALU && instr->alu.export >= 0;
+}
+
+static inline instr_alloc_type_t export_buf(unsigned num)
+{
+	return num < 32 ? SQ_PARAMETER_PIXEL :
+		num >= 62 ? SQ_POSITION : SQ_MEMORY;
+}
+
+/* component c for channel i, stored relative to the channel: (c - i) mod 4 */
+static inline unsigned swiz_set(unsigned c, unsigned i)
+{
+	return ((c - i) & 3) << i * 2;
+}
+
+/* get swizzle in channel i: the absolute component (0..3) that channel reads */
+static inline unsigned swiz_get(unsigned swiz, unsigned i)
+{
+	return ((swiz >> i * 2) + i) & 3;
+}
+
+static inline unsigned swiz_merge(unsigned swiz0, unsigned swiz1)
+{
+	unsigned swiz = 0; /* composition: result channel i reads swiz0[swiz1[i]] */
+	for (int i = 0; i < 4; i++)
+		swiz |= swiz_set(swiz_get(swiz0, swiz_get(swiz1, i)), i);
+	return swiz;
+}
+
+/* in-place swiz_merge(): *swiz0 = swiz_merge(*swiz0, swiz1) */
+static inline void swiz_merge_p(uint8_t *swiz0, unsigned swiz1)
+{
+	/* share the composition loop with swiz_merge() so both stay in sync;
+	 * the result is four 2-bit fields, so it always fits the uint8_t */
+	*swiz0 = swiz_merge(*swiz0, swiz1);
+}
+
+static inline struct ir2_reg * get_reg(struct ir2_instr *instr)
+{
+	return instr->is_ssa ? &instr->ssa : instr->reg;
+}
+
+static inline struct ir2_reg *
+get_reg_src(struct ir2_context *ctx, struct ir2_src *src)
+{
+	switch (src->type) {
+	case IR2_SRC_INPUT:
+		return &ctx->input[src->num];
+	case IR2_SRC_SSA:
+		return &ctx->instr[src->num].ssa;
+	case IR2_SRC_REG:
+		return &ctx->reg[src->num];
+	default:
+		return NULL;
+	}
+}
+
+/* gets a ncomp value for the dst */
+static inline unsigned dst_ncomp(struct ir2_instr *instr)
+{
+	if (instr->is_ssa)
+		return instr->ssa.ncomp;
+
+	assert(instr->type == IR2_ALU);
+
+	unsigned ncomp = 0;
+	for (int i = 0; i < instr->reg->ncomp; i++)
+		ncomp += !!(instr->alu.write_mask & 1 << i);
+	return ncomp;
+}
+
+/* gets a ncomp value for the src registers */
+static inline unsigned src_ncomp(struct ir2_instr *instr)
+{
+	if (instr->type == IR2_FETCH) {
+		switch (instr->fetch.opc) {
+		case VTX_FETCH:
+			return 1;
+		case TEX_FETCH:
+			return instr->fetch.tex.is_cube ? 3 : 2;
+		case TEX_SET_TEX_LOD:
+			return 1;
+		default:
+			assert(0);
+		}
+	}
+
+	switch (instr->alu.scalar_opc) {
+	case PRED_SETEs ... KILLONEs:
+		return 1;
+	default:
+		break;
+	}
+
+	switch (instr->alu.vector_opc) {
+	case DOT2ADDv:
+		return 2;
+	case DOT3v:
+		return 3;
+	case DOT4v:
+	case CUBEv:
+	case PRED_SETE_PUSHv:
+		return 4;
+	default:
+		return dst_ncomp(instr);
+	}
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_ra.c b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c
new file mode 100644
index 0000000000..f37eb36b4b
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_ra.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include "ir2_private.h"
+
+/* if an instruction has side effects, we should never kill it */
+static bool has_side_effects(struct ir2_instr *instr)
+{
+	if (instr->type == IR2_CF)
+		return true;
+	else if (instr->type == IR2_FETCH)
+		return false;
+
+	switch (instr->alu.scalar_opc) {
+	case PRED_SETEs ... KILLONEs:
+		return true;
+	default:
+		break;
+	}
+
+	switch (instr->alu.vector_opc) {
+	case PRED_SETE_PUSHv ... KILLNEv:
+		return true;
+	default:
+		break;
+	}
+
+	return instr->alu.export >= 0;
+}
+
+/* mark an instruction as required, and all its sources recursively */
+static void set_need_emit(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	/* don't repeat work already done (this also ends the recursion) */
+	if (instr->need_emit)
+		return;
+
+	instr->need_emit = true;
+
+	ir2_foreach_src(src, instr) {
+		struct ir2_reg *reg;
+		switch (src->type) {
+		case IR2_SRC_SSA:
+			set_need_emit(ctx, &ctx->instr[src->num]);
+			break;
+		case IR2_SRC_REG:
+			/* slow: every instruction writing this non-ssa reg is needed */
+			reg = get_reg_src(ctx, src);
+			ir2_foreach_instr(writer, ctx) {
+				if (!writer->is_ssa && writer->reg == reg)
+					set_need_emit(ctx, writer);
+			}
+			break;
+		default:
+			break;
+		}
+	}
+}
+
+/* allocated-component bits (x=1, y=2, z=4, w=8) of hardware register idx */
+static unsigned reg_mask(struct ir2_context *ctx, unsigned idx)
+{
+	return mask_reg(ctx->reg_state, idx);
+}
+
+static void reg_setmask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+	idx = idx * 4 + c; /* bit index: 4 component bits per register */
+	ctx->reg_state[idx/32] |= 1u << idx%32; /* 1u: int 1 << 31 is undefined */
+}
+
+static void reg_freemask(struct ir2_context *ctx, unsigned idx, unsigned c)
+{
+	idx = idx * 4 + c; /* bit index: 4 component bits per register */
+	ctx->reg_state[idx/32] &= ~(1u << idx%32); /* 1u: int 1 << 31 is undefined */
+}
+
+void ra_count_refs(struct ir2_context *ctx)
+{
+	struct ir2_reg *reg;
+
+	/* mark instructions as needed
+	 * need to do this because "substitutions" pass makes many movs not needed
+	 */
+	ir2_foreach_instr(instr, ctx) {
+		if (has_side_effects(instr))
+			set_need_emit(ctx, instr);
+	}
+
+	/* compute ref_counts */
+	ir2_foreach_instr(instr, ctx) {
+		/* kill non-needed so they can be skipped */
+		if (!instr->need_emit) {
+			instr->type = IR2_NONE;
+			continue;
+		}
+
+		ir2_foreach_src(src, instr) {
+			if (src->type == IR2_SRC_CONST)
+				continue;
+
+			reg = get_reg_src(ctx, src);
+			for (int i = 0; i < src_ncomp(instr); i++)
+				reg->comp[swiz_get(src->swizzle, i)].ref_count++;
+		}
+	}
+}
+
+void ra_reg(struct ir2_context *ctx, struct ir2_reg *reg, int force_idx,
+	bool export, uint8_t export_writemask)
+{
+	/* for export, don't allocate anything but set component layout */
+	if (export) {
+		for (int i = 0; i < 4; i++)
+			reg->comp[i].c = i;
+		return;
+	}
+
+	unsigned idx = force_idx;
+
+	/* TODO: allocate into the same register if theres room
+	 * note: the blob doesn't do it, so verify that it is indeed better
+	 * also, doing it would conflict with scalar mov insertion
+	 */
+
+	/* check if already allocated */
+	for (int i = 0; i < reg->ncomp; i++) {
+		if (reg->comp[i].alloc)
+			return;
+	}
+
+	if (force_idx < 0) {
+		for (idx = 0; idx < 64; idx++) {
+			if (reg_mask(ctx, idx) == 0)
+				break;
+		}
+	}
+	assert(idx != 64); /* TODO ran out of register space.. */
+
+	/* update max_reg value */
+	ctx->info->max_reg = MAX2(ctx->info->max_reg, (int) idx);
+
+	unsigned mask = reg_mask(ctx, idx);
+
+	for (int i = 0; i < reg->ncomp; i++) {
+		/* don't allocate never used values */
+		if (reg->comp[i].ref_count == 0) {
+			reg->comp[i].c = 7;
+			continue;
+		}
+
+		/* TODO */
+		unsigned c = 1 ? i : (ffs(~mask) - 1);
+		mask |= 1 << c;
+		reg->comp[i].c = c;
+		reg_setmask(ctx, idx, c);
+		reg->comp[i].alloc = true;
+	}
+
+	reg->idx = idx;
+	ctx->live_regs[reg->idx] = reg;
+}
+
+/* reduce srcs ref_count and free if needed */
+void ra_src_free(struct ir2_context *ctx, struct ir2_instr *instr)
+{
+	struct ir2_reg *reg;
+	struct ir2_reg_component *comp;
+
+	ir2_foreach_src(src, instr) {
+		if (src->type == IR2_SRC_CONST)
+			continue;
+
+		reg = get_reg_src(ctx, src);
+		/* XXX use before write case */
+
+		for (int i = 0; i < src_ncomp(instr); i++) {
+			comp = &reg->comp[swiz_get(src->swizzle, i)];
+			if (!--comp->ref_count && reg->block_idx_free < 0) {
+				reg_freemask(ctx, reg->idx, comp->c);
+				comp->alloc = false;
+			}
+		}
+	}
+}
+
+/* free any regs left for a block */
+void ra_block_free(struct ir2_context *ctx, unsigned block)
+{
+	ir2_foreach_live_reg(reg, ctx) {
+		if (reg->block_idx_free != block)
+			continue;
+
+		for (int i = 0; i < reg->ncomp; i++) {
+			if (!reg->comp[i].alloc) /* XXX should never be true? */
+				continue;
+
+			reg_freemask(ctx, reg->idx, reg->comp[i].c);
+			reg->comp[i].alloc = false;
+		}
+		ctx->live_regs[reg->idx] = NULL;
+	}
+}
diff --git a/src/gallium/drivers/freedreno/a2xx/ir2_substitutions.c b/src/gallium/drivers/freedreno/a2xx/ir2_substitutions.c
new file mode 100644
index 0000000000..c878ede665
--- /dev/null
+++ b/src/gallium/drivers/freedreno/a2xx/ir2_substitutions.c
@@ -0,0 +1,226 @@
+/*
+ * Copyright (C) 2018 Jonathan Marek <jonathan at marek.ca>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Jonathan Marek <jonathan at marek.ca>
+ */
+
+#include "ir2_private.h"
+
+static bool is_single_mov(struct ir2_instr *instr)
+{
+	return instr->type == IR2_ALU && instr->alu.vector_opc == MAXv &&
+		instr->src_count == 1;
+}
+
+static void substitute(struct ir2_src *src, struct ir2_src b)
+{
+	src->swizzle = swiz_merge(b.swizzle, src->swizzle); /* b.swizzle[src->swizzle[i]] */
+	src->type = b.type;
+	src->num = b.num;
+	if (!src->abs) /* if we have abs we don't care about previous negate */
+		src->negate ^= b.negate;
+	src->abs |= b.abs;
+}
+
+/* substitutions: replace src regs when they refer to a mov instruction
+ * example:
+ *	ALU:      MAXv    R7 = C7, C7
+ *	ALU:      MULADDv R7 = R7, R10, R0.xxxx
+ * becomes:
+ *	ALU:      MULADDv R7 = C7, R10, R0.xxxx
+ */
+void substitutions(struct ir2_context *ctx)
+{
+	struct ir2_instr *mov;
+
+	ir2_foreach_instr(instr, ctx) {
+		ir2_foreach_src(src, instr) {
+			/* keep going while src is still produced by a plain mov */
+			for (;;) {
+				if (src->type != IR2_SRC_SSA)
+					break;
+
+				mov = &ctx->instr[src->num];
+				/* don't work across blocks to avoid possible issues */
+				if (mov->block_idx != instr->block_idx)
+					break;
+
+				if (!is_single_mov(mov))
+					break;
+
+				/* can't apply abs to const src, const src only for alu */
+				if (mov->src[0].type == IR2_SRC_CONST &&
+					(src->abs || instr->type != IR2_ALU))
+					break;
+
+				substitute(src, mov->src[0]);
+			}
+		}
+	}
+}
+
+/* late substitution: redirect directly to export when possible
+ * in the substitution pass we bypass any mov instructions related
+ * to the src registers, but for exports we need something different
+ * example:
+ *	ALU:      MAXv    R3.x___ = C9.x???, C9.x???
+ *	ALU:      MAXv    R3._y__ = R0.?x??, C8.?x??
+ *	ALU:      MAXv    export0 = R3.yyyx, R3.yyyx
+ * becomes:
+ *	ALU:      MAXv    export0.___w = C9.???x, C9.???x
+ *	ALU:      MAXv    export0.xyz_ = R0.xxx?, C8.xxx?
+ *
+ */
+void late_substitutions(struct ir2_context *ctx)
+{
+	struct ir2_instr *ins[4];
+	struct ir2_src *src;
+	struct ir2_reg *reg;
+	unsigned ncomp;
+
+	ir2_foreach_instr(instr, ctx) {
+		if (!is_export(instr)) /* TODO */
+			continue;
+
+		if (!is_single_mov(instr))
+			continue;
+
+		src = &instr->src[0];
+
+		/* TODO add logic for this */
+		if (src->negate || src->abs)
+			continue;
+
+		if (src->type == IR2_SRC_INPUT || src->type == IR2_SRC_CONST)
+			continue;
+
+		reg = get_reg_src(ctx, src);
+		ncomp = dst_ncomp(instr);
+
+		struct ir2_instr *c[4] = { NULL }; /* reset for every export mov */
+		unsigned reswiz[4] = {}, num_instr = 0;
+
+		/* fill array c with pointers to instrs that write each component */
+		if (src->type == IR2_SRC_SSA) {
+			struct ir2_instr *writer = &ctx->instr[src->num];
+
+			if (writer->type != IR2_ALU)
+				continue;
+
+			for (int i = 0; i < ncomp; i++)
+				c[i] = writer;
+
+			ins[num_instr++] = writer;
+			reswiz[0] = src->swizzle;
+		} else {
+			bool ok = true;
+			unsigned write_mask = 0;
+
+			ir2_foreach_instr(writer, ctx) {
+				if (writer->is_ssa || writer->reg != reg)
+					continue;
+
+				/* set by non-ALU */
+				if (writer->type != IR2_ALU) {
+					ok = false;
+					break;
+				}
+
+				/* component written more than once */
+				if (write_mask & writer->alu.write_mask) {
+					ok = false;
+					break;
+				}
+
+				write_mask |= writer->alu.write_mask;
+
+				/* src pointers for components */
+				for (int i = 0, j = 0; i < 4; i++) {
+					unsigned k = swiz_get(src->swizzle, i);
+					if (writer->alu.write_mask & 1 << k) {
+						c[i] = writer;
+
+						/* reswiz = compressed src->swizzle */
+						unsigned x = 0;
+						for (int n = 0; n < k; n++)
+							x += !!(writer->alu.write_mask & 1 << n);
+
+						assert(src->swizzle || x == j);
+						reswiz[num_instr] |= swiz_set(x, j++);
+					}
+				}
+				ins[num_instr++] = writer;
+			}
+			if (!ok)
+				continue;
+		}
+
+		bool redirect = true;
+
+		/* must all be in same block (a component without a writer blocks it) */
+		for (int i = 0; i < ncomp; i++)
+			redirect &= c[i] && c[i]->block_idx == instr->block_idx;
+
+		/* no other instr using the value */
+		ir2_foreach_instr(p, ctx) {
+			if (p == instr)
+				continue;
+			ir2_foreach_src(s, p)
+				redirect &= reg != get_reg_src(ctx, s);
+		}
+
+		if (!redirect)
+			continue;
+
+		/* redirect the instructions writing to the register */
+		for (int i = 0; i < num_instr; i++) {
+			struct ir2_instr *p = ins[i];
+
+			p->alu.export = instr->alu.export;
+			p->alu.write_mask = 0;
+			p->is_ssa = true;
+			p->ssa.ncomp = 0;
+			memset(p->ssa.comp, 0, sizeof(p->ssa.comp));
+			/* test the writer's opcode (instr is the export mov, always MAXv) */
+			switch (p->alu.vector_opc) {
+			case PRED_SETE_PUSHv ... PRED_SETGTE_PUSHv:
+			case DOT2ADDv:
+			case DOT3v:
+			case DOT4v:
+			case CUBEv:
+				continue;
+			default:
+				break;
+			}
+			ir2_foreach_src(s, p)
+				swiz_merge_p(&s->swizzle, reswiz[i]);
+		}
+
+		for (int i = 0; i < ncomp; i++) {
+			c[i]->alu.write_mask |= (1 << i);
+			c[i]->ssa.ncomp++;
+		}
+		instr->type = IR2_NONE;
+		instr->need_emit = false;
+	}
+}
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 01038d3091..a70a228f11 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -56,14 +56,6 @@ struct fd_texture_stateobj {
 
 struct fd_program_stateobj {
 	void *vp, *fp;
-
-	/* rest only used by fd2.. split out: */
-	uint8_t num_exports;
-	/* Indexed by semantic name or TGSI_SEMANTIC_COUNT + semantic index
-	 * for TGSI_SEMANTIC_GENERIC.  Special vs exports (position and point-
-	 * size) are not included in this
-	 */
-	uint8_t export_linkage[63];
 };
 
 struct fd_constbuf_stateobj {
diff --git a/src/gallium/drivers/freedreno/freedreno_program.c b/src/gallium/drivers/freedreno/freedreno_program.c
index 989ccd1838..3fa09ce0c4 100644
--- a/src/gallium/drivers/freedreno/freedreno_program.c
+++ b/src/gallium/drivers/freedreno/freedreno_program.c
@@ -129,15 +129,14 @@ void fd_prog_init(struct pipe_context *pctx)
 	pctx->bind_fs_state = fd_fp_state_bind;
 	pctx->bind_vs_state = fd_vp_state_bind;
 
-	// XXX for now, let a2xx keep it's own hand-rolled shaders
-	// for solid and blit progs:
-	if (ctx->screen->gpu_id < 300)
-		return;
-
 	ctx->solid_prog.fp = assemble_tgsi(pctx, solid_fp, true);
 	ctx->solid_prog.vp = assemble_tgsi(pctx, solid_vp, false);
 	ctx->blit_prog[0].vp = assemble_tgsi(pctx, blit_vp, false);
 	ctx->blit_prog[0].fp = fd_prog_blit(pctx, 1, false);
+
+	if (ctx->screen->gpu_id < 300)
+		return;
+
 	for (i = 1; i < ctx->screen->max_rts; i++) {
 		ctx->blit_prog[i].vp = ctx->blit_prog[0].vp;
 		ctx->blit_prog[i].fp = fd_prog_blit(pctx, i + 1, false);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index a55403804b..c7f4d9eca6 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -58,6 +58,7 @@
 
 
 #include "ir3/ir3_nir.h"
+#include "a2xx/ir2.h"
 
 /* XXX this should go away */
 #include "state_tracker/drm_driver.h"
@@ -189,7 +190,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TEXTURE_SWIZZLE:
 	case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
 	case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP:
 	case PIPE_CAP_VERTEX_COLOR_UNCLAMPED:
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
@@ -203,6 +203,12 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_INVALIDATE_BUFFER:
 		return 1;
 
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER:
+		return is_ir3(screen);
+
+	case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+		return !is_ir3(screen);
+
 	case PIPE_CAP_VERTEXID_NOBASE:
 		return is_a3xx(screen) || is_a4xx(screen);
 
@@ -505,16 +511,9 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen,
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
 		return 16;
 	case PIPE_SHADER_CAP_PREFERRED_IR:
-		if (is_ir3(screen))
-			return PIPE_SHADER_IR_NIR;
-		return PIPE_SHADER_IR_TGSI;
+		return PIPE_SHADER_IR_NIR;
 	case PIPE_SHADER_CAP_SUPPORTED_IRS:
-		if (is_ir3(screen)) {
-			return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
-		} else {
-			return (1 << PIPE_SHADER_IR_TGSI);
-		}
-		return 0;
+		return (1 << PIPE_SHADER_IR_NIR) | (1 << PIPE_SHADER_IR_TGSI);
 	case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
 		return 32;
 	case PIPE_SHADER_CAP_SCALAR_ISA:
@@ -645,7 +644,7 @@ fd_get_compiler_options(struct pipe_screen *pscreen,
 	if (is_ir3(screen))
 		return ir3_get_compiler_options(screen->compiler);
 
-	return NULL;
+	return ir2_get_compiler_options();
 }
 
 boolean
diff --git a/src/gallium/drivers/freedreno/meson.build b/src/gallium/drivers/freedreno/meson.build
index e9fd26b6ff..b61e503c91 100644
--- a/src/gallium/drivers/freedreno/meson.build
+++ b/src/gallium/drivers/freedreno/meson.build
@@ -90,8 +90,6 @@ files_libfreedreno = files(
   'a2xx/disasm-a2xx.c',
   'a2xx/fd2_blend.c',
   'a2xx/fd2_blend.h',
-  'a2xx/fd2_compiler.c',
-  'a2xx/fd2_compiler.h',
   'a2xx/fd2_context.c',
   'a2xx/fd2_context.h',
   'a2xx/fd2_draw.c',
@@ -115,8 +113,14 @@ files_libfreedreno = files(
   'a2xx/fd2_zsa.c',
   'a2xx/fd2_zsa.h',
   'a2xx/instr-a2xx.h',
-  'a2xx/ir-a2xx.c',
-  'a2xx/ir-a2xx.h',
+  'a2xx/ir2.c',
+  'a2xx/ir2_nir.c',
+  'a2xx/ir2_nir_lower_scalar.c',
+  'a2xx/ir2_substitutions.c',
+  'a2xx/ir2_ra.c',
+  'a2xx/ir2_assemble.c',
+  'a2xx/ir2_private.h',
+  'a2xx/ir2.h',
   'a3xx/a3xx.xml.h',
   'a3xx/fd3_blend.c',
   'a3xx/fd3_blend.h',
-- 
2.17.1