[Mesa-dev] [PATCH 2/2] panfrost: Implement Midgard shader toolchain

Alyssa Rosenzweig alyssa at rosenzweig.io
Wed Jan 30 05:43:44 UTC 2019


This patch implements the free Midgard shader toolchain: the assembler,
the disassembler, and the NIR-based compiler. The assembler is a
standalone Python script kept for reference only and is not wired into
the build. The disassembler and the compiler are implemented in C,
accessible via the standalone `midgard_compiler` binary. Later patches
will use these interfaces from the driver for online compilation.
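
For reference, the standalone tool added in cmdline.c (below) is driven
as follows; the output file names are fixed by the tool:

    midgard_compiler compile program.vert program.frag
    midgard_compiler blend program.blend
    midgard_compiler disasm binary.bin

"compile" writes vertex.bin and fragment.bin, "blend" writes blend.bin,
and "disasm" prints the disassembly to stdout. The compile paths call
the midgard_compile_shader_nir() entry point declared in
midgard_compile.h; disassembly calls disassemble_midgard().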

Signed-off-by: Alyssa Rosenzweig <alyssa at rosenzweig.io>
---
 src/gallium/drivers/panfrost/meson.build      |   44 +-
 .../drivers/panfrost/midgard/assemble.py      |  643 +++
 .../drivers/panfrost/midgard/cmdline.c        |  145 +
 .../drivers/panfrost/midgard/cppwrap.cpp      |    9 +
 .../drivers/panfrost/midgard/disassemble.c    |  986 +++++
 .../drivers/panfrost/midgard/disassemble.h    |    2 +
 .../drivers/panfrost/midgard/helpers.h        |  236 ++
 .../drivers/panfrost/midgard/midgard-parse.h  |   70 +
 .../drivers/panfrost/midgard/midgard.h        |  473 +++
 .../panfrost/midgard/midgard_compile.c        | 3621 +++++++++++++++++
 .../panfrost/midgard/midgard_compile.h        |   80 +
 .../drivers/panfrost/midgard/midgard_nir.h    |    5 +
 .../panfrost/midgard/midgard_nir_algebraic.py |   71 +
 13 files changed, 6383 insertions(+), 2 deletions(-)
 create mode 100644 src/gallium/drivers/panfrost/midgard/assemble.py
 create mode 100644 src/gallium/drivers/panfrost/midgard/cmdline.c
 create mode 100644 src/gallium/drivers/panfrost/midgard/cppwrap.cpp
 create mode 100644 src/gallium/drivers/panfrost/midgard/disassemble.c
 create mode 100644 src/gallium/drivers/panfrost/midgard/disassemble.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/helpers.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard-parse.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard_compile.c
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard_compile.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard_nir.h
 create mode 100644 src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py

diff --git a/src/gallium/drivers/panfrost/meson.build b/src/gallium/drivers/panfrost/meson.build
index fdf66c0213..f4dec42ad3 100644
--- a/src/gallium/drivers/panfrost/meson.build
+++ b/src/gallium/drivers/panfrost/meson.build
@@ -23,6 +23,10 @@ files_panfrost = files(
   'pan_public.h',
   'pan_screen.c',
   'pan_screen.h',
+
+  'midgard/midgard_compile.c',
+  'midgard/cppwrap.cpp',
+  'midgard/disassemble.c',
 )
 
 inc_panfrost = [
@@ -32,12 +36,25 @@ inc_panfrost = [
   inc_drm_uapi,
   inc_include,
   inc_src,
-  include_directories('include')
+  include_directories('include'),
+  include_directories('midgard'),
 ]
 
+midgard_nir_algebraic_c = custom_target(
+  'midgard_nir_algebraic.c',
+  input : 'midgard/midgard_nir_algebraic.py',
+  output : 'midgard_nir_algebraic.c',
+  command : [
+    prog_python, '@INPUT@',
+    '-p', join_paths(meson.source_root(), 'src/compiler/nir/'),
+  ],
+  capture : true,
+  depend_files : nir_algebraic_py,
+)
+
 libpanfrost = static_library(
   'panfrost',
-  [files_panfrost],
+  [files_panfrost, midgard_nir_algebraic_c],
   dependencies: [
     dep_thread,
     idep_nir
@@ -50,3 +67,26 @@ driver_panfrost = declare_dependency(
   compile_args : ['-DGALLIUM_PANFROST', '-Wno-pointer-arith'],
   link_with : [libpanfrost, libpanfrostwinsys],
 )
+
+files_midgard = files(
+  'midgard/midgard_compile.c',
+  'midgard/cppwrap.cpp',
+  'midgard/disassemble.c',
+  'midgard/cmdline.c',
+)
+
+midgard_compiler = executable(
+  'midgard_compiler',
+  [files_midgard, midgard_nir_algebraic_c],
+  include_directories : inc_panfrost,
+  dependencies : [
+    dep_thread,
+    idep_nir
+  ],
+  link_with : [
+    libgallium,
+    libglsl_standalone,
+    libmesa_util
+  ],
+  build_by_default : true
+)
diff --git a/src/gallium/drivers/panfrost/midgard/assemble.py b/src/gallium/drivers/panfrost/midgard/assemble.py
new file mode 100644
index 0000000000..8088934e1d
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/assemble.py
@@ -0,0 +1,643 @@
+"""
+Copyright (C) 2018 Alyssa Rosenzweig
+Copyright (c) 2013 Connor Abbott (connor at abbott.cx)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import pprint
+import struct
+
+program = []
+
+# Definitions from cwabbott's tools
+
+t6xx_alu_ops = {
+    "fadd":  0x10,
+    "fmul":  0x14,
+    "fmin":  0x28,
+    "fmax":  0x2C,
+    "fmov":  0x30,
+    "ffloor":  0x36,
+    "fceil":  0x37,
+    "fdot3":  0x3C,
+    "fdot3r":  0x3D,
+    "fdot4":  0x3E,
+    "freduce":  0x3F,
+    "iadd":  0x40,
+    "isub":  0x46,
+    "imul":  0x58,
+    "imov":  0x7B,
+    "feq":  0x80,
+    "fne":  0x81,
+    "flt":  0x82,
+    "fle":  0x83,
+    "f2i":  0x99,
+    "f2u8":  0x9C,
+    "u2f": 0xBC,
+    "ieq":  0xA0,
+    "ine":  0xA1,
+    "ilt":  0xA4,
+    "ile":  0xA5,
+    "iand": 0x70,
+    "ior": 0x71,
+    "inot": 0x72,
+    "iandnot": 0x74,
+    "ixor": 0x76,
+    "ball":  0xA9,
+    "bany":  0xB1,
+    "i2f":  0xB8,
+    "csel":  0xC5,
+    "fatan_pt2":  0xE8,
+    "frcp":  0xF0,
+    "frsqrt":  0xF2,
+    "fsqrt":  0xF3,
+    "fexp2":  0xF4,
+    "flog2":  0xF5,
+    "fsin":  0xF6,
+    "fcos":  0xF7,
+    "fatan2_pt1":  0xF9,
+}
+
+t6xx_alu_bits = {
+        "vmul": 17,
+        "sadd": 19,
+        "vadd": 21,
+        "smul": 23,
+        "lut": 25,
+        "br": 26,
+        "branch": 27,
+        "constants": 32
+}
+
+t6xx_alu_size_bits = {
+        "vmul": 48,
+        "sadd": 32,
+        "vadd": 48,
+        "smul": 32,
+        "lut": 48,
+        "br": 16,
+        "branch": 48
+}
+
+t6xx_outmod = {
+        "none": 0,
+        "pos": 1,
+        "int": 2,
+        "sat": 3
+}
+
+t6xx_reg_mode = {
+    "quarter": 0,
+    "half": 1,
+    "full": 2,
+    "double": 3
+}
+
+t6xx_dest_override = {
+    "lower": 0,
+    "upper": 1,
+    "none": 2
+}
+
+t6xx_load_store_ops = {
+    "ld_st_noop":  0x03,
+    "ld_attr_16":  0x95,
+    "ld_attr_32":  0x94,
+    "ld_vary_16":  0x99,
+    "ld_vary_32":  0x98,
+    "ld_uniform_16":  0xAC,
+    "ld_uniform_32":  0xB0,
+    "st_vary_16":  0xD5,
+    "st_vary_32":  0xD4,
+    "ld_color_buffer_8": 0xBA
+}
+
+t6xx_tag = {
+        "texture": 0x3,
+        "load_store": 0x5,
+        "alu4": 0x8,
+        "alu8": 0x9,
+        "alu12": 0xA,
+        "alu16": 0xB,
+}
+
+def is_tag_alu(tag):
+    return (tag >= t6xx_tag["alu4"]) and (tag <= t6xx_tag["alu16"])
+
+# Just an enum
+
+ALU = 0
+LDST = 1
+TEXTURE = 2
+
+# Constant types supported, mapping the constant prefix to the Python format
+# string and the coercion function
+
+constant_types = {
+        "f": ("f", float),
+        "h": ("e", float),
+        "i": ("i", int),
+        "s": ("h", int)
+}
+
+compact_branch_op = {
+        "jump": 1,
+        "branch": 2,
+        "discard": 4,
+        "write": 7
+}
+
+branch_condition = {
+        "false": 1,
+        "true": 2,
+        "always": 3,
+}
+
+# TODO: What else?
+
+texture_op = {
+        "normal": 0x11,
+        "texelfetch": 0x14
+}
+
+texture_fmt = {
+        "2d": 0x02,
+        "3d": 0x03
+}
+	
+with open(sys.argv[1], "r") as f:
+    for ln in f:
+        space = ln.strip().split(" ")
+
+        instruction = space[0]
+        rest = " ".join(space[1:])
+
+        arguments = [s.strip() for s in rest.split(",")]
+        program += [(instruction, arguments)]
+
+swizzle_component = {
+        "x": 0,
+        "y": 1,
+        "z": 2,
+        "w": 3
+}
+
+def decode_reg_name(reg_name):
+    ireg = 0
+    upper = False
+    half = False
+
+    if reg_name[0] == 'r':
+        ireg = int(reg_name[1:])
+    elif reg_name[0] == 'h':
+        rreg = int(reg_name[2:])
+
+        # Decode half-register into its full register's half
+        ireg = rreg >> 1
+        upper = rreg & 1
+        half = True
+    else:
+        # Special case for load/store addresses
+        ireg = int(reg_name)
+
+    return (ireg, half, upper)
+
+def standard_swizzle_from_parts(swizzle_parts):
+    swizzle_s = swizzle_parts[1] if len(swizzle_parts) > 1 else "xyzw"
+
+    swizzle = 0
+    for (i, c) in enumerate(swizzle_s):
+        swizzle |= swizzle_component[c] << (2 * i)
+
+    return swizzle
+
+def mask_from_parts(mask_parts, large_mask):
+    mask_s = mask_parts[1] if len(mask_parts) > 1 else "xyzw"
+
+    if large_mask:
+        mask = sum([(3 << (2*swizzle_component[c]) if c in mask_s else 0) for c in "xyzw"])
+    else:
+        mask = sum([(1 << swizzle_component[c] if c in mask_s else 0) for c in "xyzw"])
+
+    return (mask, mask_s)
+
+def decode_reg(reg):
+    if reg[0] == "#":
+        # Not actually a register, instead an immediate float
+        return (True, struct.unpack("H", struct.pack("e", float(reg[1:])))[0], 0, 0, 0, 0)
+
+    # Function call syntax used in abs() modifier
+    if reg[-1] == ')':
+        reg = reg[:-1]
+
+    swizzle_parts = reg.split(".")
+
+    reg_name = swizzle_parts[0]
+
+    modifiers = 0
+
+    if reg_name[0] == '-':
+        modifiers |= 2
+        reg_name = reg_name[1:]
+
+    if reg_name[0] == 'a':
+        modifiers |= 1
+        reg_name = reg_name[len("abs("):]
+    
+    (ireg, half, upper) = decode_reg_name(reg_name)
+
+    return (False, ireg, standard_swizzle_from_parts(swizzle_parts), half, upper, modifiers)
+
+def decode_masked_reg(reg, large_mask):
+    mask_parts = reg.split(".")
+
+    reg_name = mask_parts[0]
+    (ireg, half, upper) = decode_reg_name(reg_name)
+    (mask, mask_s) = mask_from_parts(mask_parts, large_mask)
+
+    component = max([0] + [swizzle_component[c] for c in "xyzw" if c in mask_s])
+
+    return (ireg, mask, component, half, upper)
+
+# TODO: Fill these in XXX
+
+# Texture pipeline registers in r28-r29
+TEXTURE_BASE = 28
+
+def decode_texture_reg_number(reg):
+    r = reg.split(".")[0]
+
+    if r[0] == "r":
+        return (True, int(r[1:]) - TEXTURE_BASE, 0)
+    else:
+        no = int(r[2:])
+        return (False, (no >> 1) - TEXTURE_BASE, no & 1)
+
+def decode_texture_reg(reg):
+    (full, select, upper) = decode_texture_reg_number(reg)
+
+    # Swizzle is mandatory for texture registers, as far as I can tell
+    swizzle = reg.split(".")[1]
+    swizzleL = swizzle_component[swizzle[0]]
+    swizzleR = swizzle_component[swizzle[1]]
+
+    return (full, select, upper, swizzleR, swizzleL)
+
+def decode_texture_out_reg(reg):
+    (full, select, upper) = decode_texture_reg_number(reg)
+    (mask, _) = mask_from_parts(reg.split("."), False)
+
+    return (full, select, upper, mask)
+
+instruction_stream = []
+
+for p in program:
+    ins = p[0]
+    arguments = p[1]
+
+    family = ins_mod = ins.split(".")[0]
+    ins_op = (ins + ".").split(".")[1]
+
+    ins_outmod = (ins + "." + ".").split(".")[2]
+    
+    try:
+        out_mod = t6xx_outmod[ins_outmod]
+    except:
+        out_mod = 0
+
+    if ins in t6xx_load_store_ops:
+        op = t6xx_load_store_ops[ins]
+        (reg, mask, component, half, upper) = decode_masked_reg(p[1][0], False)
+        (immediate, address, swizzle, half, upper, modifiers) = decode_reg(p[1][1])
+        unknown = int(p[1][2], 16)
+        b = (op << 0) | (reg << 8) | (mask << 13) | (swizzle << 17) | (unknown << 25) | (address << 51)
+        instruction_stream += [(LDST, b)]
+    elif ins_op in t6xx_alu_ops:
+        op = t6xx_alu_ops[ins_op]
+
+        (reg_out, mask, out_component, half0, upper0) = decode_masked_reg(p[1][0], True)
+        (_, reg_in1, swizzle1, half1, upper1, mod1) = decode_reg(p[1][1])
+        (immediate, reg_in2, swizzle2, half2, upper2, mod2) = decode_reg(p[1][2])
+
+        if immediate:
+            register_word = (reg_in1 << 0) | ((reg_in2 >> 11) << 5) | (reg_out << 10) | (1 << 15)
+        else:
+            register_word = (reg_in1 << 0) | (reg_in2 << 5) | (reg_out << 10) 
+
+        if ins_mod in ["vadd", "vmul", "lut"]:
+            io_mode = t6xx_reg_mode["half" if half0 else "full"]
+            repsel = 0
+            i1half = half1
+            i2block = 0
+            output_override = 2 # NORMAL, TODO
+            wr_mask = 0
+
+            if (ins_outmod == "quarter"):
+                io_mode = t6xx_reg_mode["quarter"]
+
+            if half0:
+                # TODO: half actually
+                repsel = 2 * upper1 
+            else:
+                repsel = upper1
+
+            if half0:
+                # Rare case...
+
+                (_, halfmask, _, _, _) = decode_masked_reg(p[1][0], False)
+                wr_mask = halfmask
+            else:
+                wr_mask = mask
+
+
+            if immediate:
+                # Inline constant: lower 11 bits
+
+                i2block = ((reg_in2 & 0xFF) << 3) | ((reg_in2 >> 8) & 0x7)
+            else:
+                if half0:
+                    # TODO: replicate input 2 if half
+                    pass
+                else:
+                    # TODO: half selection
+                    i2block = upper2 | (half2 << 2)
+
+                i2block |= swizzle2 << 3
+
+            # Extra modifier for some special cased stuff
+            try:
+                special = ins.split(".")[3]
+
+                if special == "low":
+                    output_override = 0 # low
+                elif special == "fulllow":
+                    # TODO: Not really a special case, just a bug?
+                    io_mode = t6xx_reg_mode["full"]
+                    output_override = 0 #low
+                    wr_mask = 0xFF
+            except:
+                pass
+
+            instruction_word = (op << 0) | (io_mode << 8) | (mod1 << 10) | (repsel << 12) | (i1half << 14) | (swizzle1 << 15) | (mod2 << 23) | (i2block << 25) | (output_override << 36) | (out_mod << 38) | (wr_mask << 40)
+        elif ins_mod in ["sadd", "smul"]:
+            # TODO: What are these?
+            unknown2 = 0
+            unknown3 = 0
+
+            i1comp_block = 0
+
+            if half1:
+                i1comp_block = swizzle1 | (upper1 << 2)
+            else:
+                i1comp_block = swizzle1 << 1
+
+            i2block = 0
+
+            if immediate:
+                # Inline constant is splattered in a... bizarre way
+
+                i2block = (((reg_in2 >> 9) & 3) << 0) | (((reg_in2 >> 8) & 1) << 2) | (((reg_in2 >> 5) & 7) << 3) | (((reg_in2 >> 0) & 15) << 6)
+            else:
+                # TODO: half register
+                swizzle2 = (swizzle2 << 1) & 0x1F
+                i2block = (mod2 << 0) | ((not half2) << 2) | (swizzle2 << 3) | (unknown2 << 5)
+
+            outcomp_block = 0
+            
+            if True:
+                outcomp_block = out_component << 1
+            else:
+                # TODO: half register
+                pass
+
+            instruction_word = (op << 0) | (mod1 << 8) | ((not half1) << 10) | (i1comp_block << 11) | (i2block << 14) | (unknown3 << 25) | (out_mod << 26) | ((not half0) << 28) | (outcomp_block) << 29
+
+        else:
+            instruction_word = op
+
+        instruction_stream += [(ALU, ins_mod, register_word, instruction_word)]
+    elif family == "texture":
+        # Texture ops use a long series of modifiers to describe their needed
+        # capabilities, separated by dots. Decode them here.
+        parts = ins.split(".")
+
+        # First few modifiers are fixed, like an instruction name
+        tex_op = parts[1]
+        tex_fmt = parts[2]
+
+        # The remaining are variable, but strictly ordered
+        parts = parts[3:]
+
+        op = texture_op[tex_op]
+
+        # Some bits are defined directly in the modifier list
+        shadow = "shadow" in parts
+        cont = "cont" in parts
+        last = "last" in parts
+        has_filter = "raw" not in parts
+
+        # The remaining need order preserved since they have their own arguments
+        argument_parts = [part for part in parts if part not in ["shadow", "cont", "last", "raw"]]
+
+        bias_lod = 0
+
+        for argument, part in zip(argument_parts, arguments[4:]):
+            if argument == "bias":
+                bias_lod = int(float(part) * 256)
+            else:
+                print("Unknown argument: " + str(argument))
+
+        fmt = texture_fmt[tex_fmt]
+        has_offset = 0
+
+        magic1 = 1 # IDEK
+        magic2 = 2 # Where did this even come from?!
+
+        texture_handle = int(arguments[1][len("texture"):])
+        
+        sampler_parts = arguments[2].split(".")
+        sampler_handle = int(sampler_parts[0][len("sampler"):])
+        swizzle0 = standard_swizzle_from_parts(sampler_parts)
+
+        (full0, select0, upper0, mask0) = decode_texture_out_reg(arguments[0])
+        (full1, select1, upper1, swizzleR1, swizzleL1) = decode_texture_reg(arguments[3])
+
+        tex = (op << 0) | (shadow << 6) | (cont << 8) | (last << 9) | (fmt << 10) | (has_offset << 15) | (has_filter << 16) | (select1 << 17) | (upper1 << 18) | (swizzleL1 << 19) | (swizzleR1 << 21) | (0 << 23) | (magic2 << 25) | (full0 << 29) | (magic1 << 30) | (select0 << 32) | (upper0 << 33) | (mask0 << 34) | (swizzle0 << 40) | (bias_lod << 72) | (texture_handle << 88) | (sampler_handle << 104)
+
+        instruction_stream += [(TEXTURE, tex)]
+    elif family == "br":
+        cond = ins.split(".")[2]
+        condition = branch_condition[cond]
+        bop = compact_branch_op[ins_op]
+
+        offset = int(arguments[0].split("->")[0])
+
+        # 2's complement and chill
+        if offset < 0:
+            offset = (1 << 7) - abs(offset)
+
+        # Find where we're going
+        dest_tag = int(arguments[0].split("->")[1])
+
+        br = (bop << 0) | (dest_tag << 3) | (offset << 7) | (condition << 14)
+
+        # TODO: Unconditional branch encoding
+
+        instruction_stream += [(ALU, "br", None, br)]
+    elif ins[1:] == "constants":
+        if ins[0] not in constant_types:
+            print("Unknown constant type " + str(constant_type))
+            break
+
+        (fmt, cast) = constant_types[ins[0]]
+
+        encoded = [struct.pack(fmt, cast(f)) for f in p[1]]
+
+        consts = bytearray()
+        for c in encoded:
+            consts += c
+
+        # consts must be exactly 4 quadwords, so pad with zeroes if necessary
+        consts += bytes(4*4 - len(consts))
+
+        instruction_stream += [(ALU, "constants", consts)]
+
+# Emit from instruction stream
+instructions = []
+index = 0
+while index < len(instruction_stream):
+    output_stream = bytearray()
+    ins = instruction_stream[index]
+    tag = ins[0]
+
+    can_prefetch = index + 1 < len(instruction_stream)
+    succeeding = None
+
+    if tag == LDST:
+        succeeding = instruction_stream[index + 1] if can_prefetch else None
+        parta = ins[1]
+        partb = None
+
+        if succeeding and succeeding[0] == LDST:
+            partb = succeeding[1]
+            index += 1
+        else:
+            partb = parta
+            parta = t6xx_load_store_ops["ld_st_noop"]
+
+        tag8 = t6xx_tag["load_store"]
+
+        ins = (partb << 68) | (parta << 8) | tag8
+        output_stream += (ins.to_bytes(16, "little"))
+    elif tag == TEXTURE:
+        tag8 = t6xx_tag["texture"] 
+        ins = (ins[1] << 8) | tag8
+
+        output_stream += (ins.to_bytes(16, "little"))
+    elif tag == ALU:
+        # TODO: Combining ALU ops
+
+        emit_size = 4 # 32-bit tag always emitted
+
+        tag = 0
+        register_words = bytearray()
+        body_words = bytearray()
+        constant_words = None
+
+        last_alu_bit = 0
+
+        # Iterate through while there are ALU tags in strictly ascending order
+        while index < len(instruction_stream) and instruction_stream[index][0] == ALU and t6xx_alu_bits[instruction_stream[index][1]] > last_alu_bit:
+            ins = instruction_stream[index]
+
+            bit = t6xx_alu_bits[ins[1]]
+            last_alu_bit = bit
+
+            if ins[1] == "constants":
+                constant_words = ins[2]
+            else:
+                # Flag for the used part of the GPU
+                tag |= 1 << bit
+
+                # 16-bit register word, if present
+                if ins[2] is not None:
+                    register_words += (ins[2].to_bytes(2, "little"))
+                    emit_size += 2
+
+                size = int(t6xx_alu_size_bits[ins[1]] / 8)
+                body_words += (ins[3].to_bytes(size, "little"))
+                emit_size += size
+
+            index += 1
+
+        index -= 1 # fix off by one, from later loop increment
+
+        # Pad to nearest multiple of 4 words
+        padding = (16 - (emit_size & 15)) if (emit_size & 15) else 0
+        emit_size += padding
+
+        # emit_size includes constants
+        if constant_words:
+            emit_size += len(constant_words)
+
+        # Calculate tag given size
+        words = emit_size >> 2
+        tag |= t6xx_tag["alu" + str(words)]
+
+        # Actually emit, now that we can
+        output_stream += tag.to_bytes(4, "little")
+        output_stream += register_words
+        output_stream += body_words
+        output_stream += bytes(padding)
+
+        if constant_words:
+            output_stream += constant_words
+
+    instructions += [output_stream]
+    index += 1
+
+# Assembly over; just emit tags at this point
+binary = bytearray()
+
+for (idx, ins) in enumerate(instructions):
+    # Instruction prefetch
+    tag = 0
+
+    if idx + 1 < len(instructions):
+        tag = instructions[idx + 1][0] & 0xF
+
+        # Check for ALU special case
+
+        if is_tag_alu(tag) and idx + 2 == len(instructions):
+            tag = 1
+    else:
+        # Instruction stream over
+        
+        tag = 1
+
+    ins[0] |= tag << 4
+
+    binary += ins
+
+pprint.pprint(program)
+
+with open(sys.argv[2], "wb") as f:
+    f.write(binary)
diff --git a/src/gallium/drivers/panfrost/midgard/cmdline.c b/src/gallium/drivers/panfrost/midgard/cmdline.c
new file mode 100644
index 0000000000..15d2bbde16
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/cmdline.c
@@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2018 Alyssa Rosenzweig <alyssa at rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "compiler/glsl/standalone.h"
+#include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir_types.h"
+#include "midgard_compile.h"
+#include "disassemble.h"
+#include "util/u_dynarray.h"
+#include "main/mtypes.h"
+
+bool c_do_mat_op_to_vec(struct exec_list *instructions);
+
+static void
+finalise_to_disk(const char *filename, struct util_dynarray *data)
+{
+        FILE *fp;
+        fp = fopen(filename, "wb");
+        fwrite(data->data, 1, data->size, fp);
+        fclose(fp);
+
+        util_dynarray_fini(data);
+}
+
+static void
+compile_shader(char **argv)
+{
+        struct gl_shader_program *prog;
+        nir_shader *nir;
+
+        struct standalone_options options = {
+                .glsl_version = 140,
+                .do_link = true,
+        };
+
+        prog = standalone_compile_shader(&options, 2, argv);
+        prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program->info.stage = MESA_SHADER_FRAGMENT;
+
+        for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
+                if (prog->_LinkedShaders[i] == NULL)
+                        continue;
+
+                c_do_mat_op_to_vec(prog->_LinkedShaders[i]->ir);
+        }
+
+        midgard_program compiled;
+        nir = glsl_to_nir(prog, MESA_SHADER_VERTEX, &midgard_nir_options);
+        midgard_compile_shader_nir(nir, &compiled, false);
+        finalise_to_disk("vertex.bin", &compiled.compiled);
+
+        nir = glsl_to_nir(prog, MESA_SHADER_FRAGMENT, &midgard_nir_options);
+        midgard_compile_shader_nir(nir, &compiled, false);
+        finalise_to_disk("fragment.bin", &compiled.compiled);
+}
+
+static void
+compile_blend(char **argv)
+{
+        struct gl_shader_program *prog;
+        nir_shader *nir;
+
+        struct standalone_options options = {
+                .glsl_version = 140,
+        };
+
+        prog = standalone_compile_shader(&options, 1, argv);
+        prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program->info.stage = MESA_SHADER_FRAGMENT;
+
+#if 0
+
+        for (unsigned i = 0; i < MESA_SHADER_STAGES; ++i) {
+                if (prog->_LinkedShaders[i] == NULL)
+                        continue;
+
+                c_do_mat_op_to_vec(prog->_LinkedShaders[i]->ir);
+        }
+
+#endif
+
+        midgard_program program;
+        nir = glsl_to_nir(prog, MESA_SHADER_FRAGMENT, &midgard_nir_options);
+        midgard_compile_shader_nir(nir, &program, true);
+        finalise_to_disk("blend.bin", &program.compiled);
+}
+
+static void
+disassemble(const char *filename)
+{
+        FILE *fp = fopen(filename, "rb");
+        assert(fp);
+
+        fseek(fp, 0, SEEK_END);
+        int filesize = ftell(fp);
+        rewind(fp);
+
+        unsigned char *code = malloc(filesize);
+        fread(code, 1, filesize, fp);
+        fclose(fp);
+
+        disassemble_midgard(code, filesize);
+        free(code);
+}
+
+int
+main(int argc, char **argv)
+{
+        if (argc < 2) {
+                fprintf(stderr, "Usage: midgard_compiler command [args]\n");
+                fprintf(stderr, "midgard_compiler compile program.vert program.frag\n");
+                fprintf(stderr, "midgard_compiler blend program.blend\n");
+                fprintf(stderr, "midgard_compiler disasm binary.bin\n");
+                exit(1);
+        }
+
+        if (strcmp(argv[1], "compile") == 0) {
+                compile_shader(&argv[2]);
+        } else if (strcmp(argv[1], "blend") == 0) {
+                compile_blend(&argv[2]);
+        } else if (strcmp(argv[1], "disasm") == 0) {
+                disassemble(argv[2]);
+        } else {
+                fprintf(stderr, "Unknown command\n");
+                exit(1);
+        }
+}
diff --git a/src/gallium/drivers/panfrost/midgard/cppwrap.cpp b/src/gallium/drivers/panfrost/midgard/cppwrap.cpp
new file mode 100644
index 0000000000..cf2ca3b7a1
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/cppwrap.cpp
@@ -0,0 +1,9 @@
+struct exec_list;
+
+bool do_mat_op_to_vec(struct exec_list *instructions);
+
+extern "C" {
+	bool c_do_mat_op_to_vec(struct exec_list *instructions) {
+		return do_mat_op_to_vec(instructions);
+	}
+};
diff --git a/src/gallium/drivers/panfrost/midgard/disassemble.c b/src/gallium/drivers/panfrost/midgard/disassemble.c
new file mode 100644
index 0000000000..afde3fdbbc
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/disassemble.c
@@ -0,0 +1,986 @@
+/* Author(s):
+ *   Connor Abbott
+ *   Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor at abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa at rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <string.h>
+#include "midgard.h"
+#include "midgard-parse.h"
+#include "disassemble.h"
+#include "util/half_float.h"
+
+#define DEFINE_CASE(define, str) case define: { printf(str); break; }
+
+static bool is_instruction_int = false;
+
+static void
+print_alu_opcode(midgard_alu_op op)
+{
+        bool int_op = false;
+
+        if (alu_opcode_names[op]) {
+                printf("%s", alu_opcode_names[op]);
+
+                int_op = alu_opcode_names[op][0] == 'i';
+        } else
+                printf("alu_op_%02X", op);
+
+        /* For constant analysis */
+        is_instruction_int = int_op;
+}
+
+static void
+print_ld_st_opcode(midgard_load_store_op op)
+{
+        if (load_store_opcode_names[op])
+                printf("%s", load_store_opcode_names[op]);
+        else
+                printf("ldst_op_%02X", op);
+}
+
+static bool is_embedded_constant_half = false;
+static bool is_embedded_constant_int = false;
+
+static void
+print_reg(unsigned reg, bool half)
+{
+        /* Perform basic static analysis for expanding constants correctly */
+
+        if (half && (reg >> 1) == 26) {
+                is_embedded_constant_half = true;
+                is_embedded_constant_int = is_instruction_int;
+        } else if (!half && reg == 26) {
+                is_embedded_constant_int = is_instruction_int;
+        }
+
+        if (half)
+                printf("h");
+
+        printf("r%u", reg);
+}
+
+static char *outmod_names[4] = {
+        "",
+        ".pos",
+        "",
+        ".sat"
+};
+
+static void
+print_outmod(midgard_outmod outmod)
+{
+        printf("%s", outmod_names[outmod]);
+}
+
+static void
+print_quad_word(uint32_t *words, unsigned tabs)
+{
+        unsigned i;
+
+        for (i = 0; i < 4; i++)
+                printf("0x%08X%s ", words[i], i == 3 ? "" : ",");
+
+        printf("\n");
+}
+
+static void
+print_vector_src(unsigned src_binary, bool out_high,
+                 bool out_half, unsigned reg)
+{
+        midgard_vector_alu_src *src = (midgard_vector_alu_src *)&src_binary;
+
+        if (src->negate)
+                printf("-");
+
+        if (src->abs)
+                printf("abs(");
+
+        //register
+
+        if (out_half) {
+                if (src->half)
+                        printf(" /* half */ ");
+
+                unsigned half_reg;
+
+                if (out_high) {
+                        if (src->rep_low)
+                                half_reg = reg * 2;
+                        else
+                                half_reg = reg * 2 + 1;
+
+                        if (src->rep_high)
+                                printf(" /* rep_high */ ");
+                } else {
+                        if (src->rep_high)
+                                half_reg = reg * 2 + 1;
+                        else
+                                half_reg = reg * 2;
+
+                        if (src->rep_low)
+                                printf(" /* rep_low */ ");
+                }
+
+                print_reg(half_reg, true);
+        } else {
+                if (src->rep_high)
+                        printf(" /* rep_high */ ");
+
+                if (src->half)
+                        print_reg(reg * 2 + src->rep_low, true);
+                else {
+                        if (src->rep_low)
+                                printf(" /* rep_low */ ");
+
+                        print_reg(reg, false);
+                }
+        }
+
+        //swizzle
+
+        if (src->swizzle != 0xE4) { //default swizzle
+                unsigned i;
+                static const char c[4] = "xyzw";
+
+                printf(".");
+
+                for (i = 0; i < 4; i++)
+                        printf("%c", c[(src->swizzle >> (i * 2)) & 3]);
+        }
+
+        if (src->abs)
+                printf(")");
+}
+
+static uint16_t
+decode_vector_imm(unsigned src2_reg, unsigned imm)
+{
+        uint16_t ret;
+        ret = src2_reg << 11;
+        ret |= (imm & 0x7) << 8;
+        ret |= (imm >> 3) & 0xFF;
+        return ret;
+}
+
+static void
+print_immediate(uint16_t imm)
+{
+        if (is_instruction_int)
+                printf("#%d", imm);
+        else
+                printf("#%g", _mesa_half_to_float(imm));
+}
+
+static void
+print_vector_field(const char *name, uint16_t *words, uint16_t reg_word,
+                   unsigned tabs)
+{
+        midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
+        midgard_vector_alu *alu_field = (midgard_vector_alu *) words;
+
+        if (alu_field->reg_mode != midgard_reg_mode_half &&
+                        alu_field->reg_mode != midgard_reg_mode_full) {
+                printf("unknown reg mode %u\n", alu_field->reg_mode);
+        }
+
+        /* For now, prefix instruction names with their unit, until we
+         * understand how this works on a deeper level */
+        printf("%s.", name);
+
+        print_alu_opcode(alu_field->op);
+        print_outmod(alu_field->outmod);
+        printf(" ");
+
+        bool half, out_half, out_high = false;
+        unsigned mask;
+
+        half = (alu_field->reg_mode == midgard_reg_mode_half);
+
+        if (half) {
+                if (alu_field->mask & 0xF) {
+                        out_high = false;
+
+                        if ((alu_field->mask & 0xF0))
+                                printf("/* %X */ ", alu_field->mask);
+
+                        mask = alu_field->mask;
+                } else {
+                        out_high = true;
+                        mask = alu_field->mask >> 4;
+                }
+        } else {
+                mask = alu_field->mask & 1;
+                mask |= (alu_field->mask & 4) >> 1;
+                mask |= (alu_field->mask & 16) >> 2;
+                mask |= (alu_field->mask & 64) >> 3;
+        }
+
+        out_half = half;
+
+        if (alu_field->dest_override != midgard_dest_override_none) {
+                if (out_half)
+                        printf("/* half */ ");
+
+                out_half = true;
+
+                if (alu_field->dest_override == midgard_dest_override_lower)
+                        out_high = false;
+                else if (alu_field->dest_override == midgard_dest_override_upper)
+                        out_high = true;
+                else
+                        assert(0);
+        }
+
+        if (out_half) {
+                if (out_high)
+                        print_reg(2 * reg_info->out_reg + 1, true);
+                else
+                        print_reg(2 * reg_info->out_reg, true);
+        } else
+                print_reg(reg_info->out_reg, false);
+
+        if (mask != 0xF) {
+                unsigned i;
+                static const char c[4] = "xyzw";
+
+                printf(".");
+
+                for (i = 0; i < 4; i++)
+                        if (mask & (1 << i))
+                                printf("%c", c[i]);
+        }
+
+        printf(", ");
+
+        print_vector_src(alu_field->src1, out_high, half, reg_info->src1_reg);
+
+        printf(", ");
+
+        if (reg_info->src2_imm) {
+                uint16_t imm = decode_vector_imm(reg_info->src2_reg, alu_field->src2 >> 2);
+                print_immediate(imm);
+        } else {
+                print_vector_src(alu_field->src2, out_high, half,
+                                 reg_info->src2_reg);
+        }
+
+        printf("\n");
+}
+
+static void
+print_scalar_src(unsigned src_binary, unsigned reg)
+{
+        midgard_scalar_alu_src *src = (midgard_scalar_alu_src *)&src_binary;
+
+        if (src->negate)
+                printf("-");
+
+        if (src->abs)
+                printf("abs(");
+
+        if (src->full)
+                print_reg(reg, false);
+        else
+                print_reg(reg * 2 + (src->component >> 2), true);
+
+        static const char c[4] = "xyzw";
+
+        printf(".%c", c[src->full ? src->component >> 1 : src->component & 3]);
+
+        if (src->abs)
+                printf(")");
+
+}
+
+static uint16_t
+decode_scalar_imm(unsigned src2_reg, unsigned imm)
+{
+        uint16_t ret;
+        ret = src2_reg << 11;
+        ret |= (imm & 3) << 9;
+        ret |= (imm & 4) << 6;
+        ret |= (imm & 0x38) << 2;
+        ret |= imm >> 6;
+        return ret;
+}
+
+static void
+print_scalar_field(const char *name, uint16_t *words, uint16_t reg_word,
+                   unsigned tabs)
+{
+        midgard_reg_info *reg_info = (midgard_reg_info *)&reg_word;
+        midgard_scalar_alu *alu_field = (midgard_scalar_alu *) words;
+
+        if (alu_field->unknown)
+                printf("scalar ALU unknown bit set\n");
+
+        printf("%s.", name);
+        print_alu_opcode(alu_field->op);
+        print_outmod(alu_field->outmod);
+        printf(" ");
+
+        if (alu_field->output_full)
+                print_reg(reg_info->out_reg, false);
+        else
+                print_reg(reg_info->out_reg * 2 + (alu_field->output_component >> 2),
+                          true);
+
+        static const char c[4] = "xyzw";
+        printf(".%c, ",
+               c[alu_field->output_full ? alu_field->output_component >> 1 :
+                                        alu_field->output_component & 3]);
+
+        print_scalar_src(alu_field->src1, reg_info->src1_reg);
+
+        printf(", ");
+
+        if (reg_info->src2_imm) {
+                uint16_t imm = decode_scalar_imm(reg_info->src2_reg,
+                                                 alu_field->src2);
+                print_immediate(imm);
+        } else
+                print_scalar_src(alu_field->src2, reg_info->src2_reg);
+
+        printf("\n");
+}
+
+static void
+print_branch_op(int op)
+{
+        switch (op) {
+        case midgard_jmp_writeout_op_branch_cond:
+                printf("cond.");
+                break;
+
+        case midgard_jmp_writeout_op_writeout:
+                printf("write.");
+                break;
+
+        case midgard_jmp_writeout_op_discard:
+                printf("discard.");
+                break;
+
+        default:
+                printf("unk%d.", op);
+                break;
+        }
+}
+
+static void
+print_branch_cond(int cond)
+{
+        switch (cond) {
+        case midgard_condition_write0:
+                printf("write0");
+                break;
+
+        case midgard_condition_false:
+                printf("false");
+                break;
+
+        case midgard_condition_true:
+                printf("true");
+                break;
+
+        case midgard_condition_always:
+                printf("always");
+                break;
+
+        default:
+                break;
+        }
+}
+
+static void
+print_compact_branch_writeout_field(uint16_t word)
+{
+        midgard_jmp_writeout_op op = word & 0x7;
+
+        switch (op) {
+        case midgard_jmp_writeout_op_branch_uncond: {
+                midgard_branch_uncond br_uncond;
+                memcpy((char *) &br_uncond, (char *) &word, sizeof(br_uncond));
+                printf("br.uncond ");
+
+                if (br_uncond.unknown != 1)
+                        printf("unknown:%d, ", br_uncond.unknown);
+
+                if (br_uncond.offset >= 0)
+                        printf("+");
+
+                printf("%d", br_uncond.offset);
+
+                printf(" -> %X\n", br_uncond.dest_tag);
+                break;
+        }
+
+        case midgard_jmp_writeout_op_branch_cond:
+        case midgard_jmp_writeout_op_writeout:
+        case midgard_jmp_writeout_op_discard:
+        default: {
+                midgard_branch_cond br_cond;
+                memcpy((char *) &br_cond, (char *) &word, sizeof(br_cond));
+
+                printf("br.");
+
+                print_branch_op(br_cond.op);
+                print_branch_cond(br_cond.cond);
+
+                printf(" ");
+
+                if (br_cond.offset >= 0)
+                        printf("+");
+
+                printf("%d", br_cond.offset);
+
+                printf(" -> %X\n", br_cond.dest_tag);
+                break;
+        }
+        }
+}
+
+static void
+print_extended_branch_writeout_field(uint8_t *words)
+{
+        midgard_branch_extended br;
+        memcpy((char *) &br, (char *) words, sizeof(br));
+
+        printf("br.");
+
+        print_branch_op(br.op);
+        print_branch_cond(br.cond);
+
+        /* XXX: This can't be right */
+        if (br.unknown)
+                printf(".unknown%d\n", br.unknown);
+
+        if (br.zero)
+                printf(".zero%d\n", br.zero);
+
+        printf(" ");
+
+        if (br.offset >= 0)
+                printf("+");
+
+        printf("%d", br.offset);
+
+        printf(" -> %X\n", br.dest_tag);
+}
+
+static unsigned
+num_alu_fields_enabled(uint32_t control_word)
+{
+        unsigned ret = 0;
+
+        if ((control_word >> 17) & 1)
+                ret++;
+
+        if ((control_word >> 19) & 1)
+                ret++;
+
+        if ((control_word >> 21) & 1)
+                ret++;
+
+        if ((control_word >> 23) & 1)
+                ret++;
+
+        if ((control_word >> 25) & 1)
+                ret++;
+
+        return ret;
+}
+
+static float
+float_bitcast(uint32_t integer)
+{
+        union {
+                uint32_t i;
+                float f;
+        } v;
+
+        v.i = integer;
+        return v.f;
+}
+
+static void
+print_alu_word(uint32_t *words, unsigned num_quad_words,
+               unsigned tabs)
+{
+        uint32_t control_word = words[0];
+        uint16_t *beginning_ptr = (uint16_t *)(words + 1);
+        unsigned num_fields = num_alu_fields_enabled(control_word);
+        uint16_t *word_ptr = beginning_ptr + num_fields;
+        unsigned num_words = 2 + num_fields;
+
+        if ((control_word >> 16) & 1)
+                printf("unknown bit 16 enabled\n");
+
+        if ((control_word >> 17) & 1) {
+                print_vector_field("vmul", word_ptr, *beginning_ptr, tabs);
+                beginning_ptr += 1;
+                word_ptr += 3;
+                num_words += 3;
+        }
+
+        if ((control_word >> 18) & 1)
+                printf("unknown bit 18 enabled\n");
+
+        if ((control_word >> 19) & 1) {
+                print_scalar_field("sadd", word_ptr, *beginning_ptr, tabs);
+                beginning_ptr += 1;
+                word_ptr += 2;
+                num_words += 2;
+        }
+
+        if ((control_word >> 20) & 1)
+                printf("unknown bit 20 enabled\n");
+
+        if ((control_word >> 21) & 1) {
+                print_vector_field("vadd", word_ptr, *beginning_ptr, tabs);
+                beginning_ptr += 1;
+                word_ptr += 3;
+                num_words += 3;
+        }
+
+        if ((control_word >> 22) & 1)
+                printf("unknown bit 22 enabled\n");
+
+        if ((control_word >> 23) & 1) {
+                print_scalar_field("smul", word_ptr, *beginning_ptr, tabs);
+                beginning_ptr += 1;
+                word_ptr += 2;
+                num_words += 2;
+        }
+
+        if ((control_word >> 24) & 1)
+                printf("unknown bit 24 enabled\n");
+
+        if ((control_word >> 25) & 1) {
+                print_vector_field("lut", word_ptr, *beginning_ptr, tabs);
+                beginning_ptr += 1;
+                word_ptr += 3;
+                num_words += 3;
+        }
+
+        if ((control_word >> 26) & 1) {
+                print_compact_branch_writeout_field(*word_ptr);
+                word_ptr += 1;
+                num_words += 1;
+        }
+
+        if ((control_word >> 27) & 1) {
+                print_extended_branch_writeout_field((uint8_t *) word_ptr);
+                word_ptr += 3;
+                num_words += 3;
+        }
+
+        if (num_quad_words > (num_words + 7) / 8) {
+                assert(num_quad_words == (num_words + 15) / 8);
+                //Assume that the extra quadword is constants
+                void *consts = words + (4 * num_quad_words - 4);
+
+                if (is_embedded_constant_int) {
+                        if (is_embedded_constant_half) {
+                                int16_t *sconsts = (int16_t *) consts;
+                                printf("sconstants %d, %d, %d, %d\n",
+                                       sconsts[0],
+                                       sconsts[1],
+                                       sconsts[2],
+                                       sconsts[3]);
+                        } else {
+                                int32_t *iconsts = (int32_t *) consts;
+                                printf("iconstants %d, %d, %d, %d\n",
+                                       iconsts[0],
+                                       iconsts[1],
+                                       iconsts[2],
+                                       iconsts[3]);
+                        }
+                } else {
+                        if (is_embedded_constant_half) {
+                                uint16_t *hconsts = (uint16_t *) consts;
+                                printf("hconstants %g, %g, %g, %g\n",
+                                       _mesa_half_to_float(hconsts[0]),
+                                       _mesa_half_to_float(hconsts[1]),
+                                       _mesa_half_to_float(hconsts[2]),
+                                       _mesa_half_to_float(hconsts[3]));
+                        } else {
+                                uint32_t *fconsts = (uint32_t *) consts;
+                                printf("fconstants %g, %g, %g, %g\n",
+                                       float_bitcast(fconsts[0]),
+                                       float_bitcast(fconsts[1]),
+                                       float_bitcast(fconsts[2]),
+                                       float_bitcast(fconsts[3]));
+                        }
+
+                }
+        }
+}
+
+/* Swizzle/mask formats are common between load/store ops and texture ops, it
+ * looks like... */
+
+static void
+print_swizzle(uint32_t swizzle)
+{
+        unsigned i;
+
+        if (swizzle != 0xE4) {
+                printf(".");
+
+                for (i = 0; i < 4; i++)
+                        printf("%c", "xyzw"[(swizzle >> (2 * i)) & 3]);
+        }
+}
+
+static void
+print_mask(uint32_t mask)
+{
+        unsigned i;
+
+        if (mask != 0xF) {
+                printf(".");
+
+                for (i = 0; i < 4; i++)
+                        if (mask & (1 << i))
+                                printf("%c", "xyzw"[i]);
+
+                /* Handle degenerate case */
+                if (mask == 0)
+                        printf("0");
+        }
+}
+
+static void
+print_varying_parameters(midgard_load_store_word *word)
+{
+        midgard_varying_parameter param;
+        unsigned v = word->varying_parameters;
+        memcpy(&param, &v, sizeof(param));
+
+        if (param.is_varying) {
+                /* If a varying, there are qualifiers */
+                if (param.flat)
+                        printf(".flat");
+
+                if (param.interpolation != midgard_interp_default) {
+                        if (param.interpolation == midgard_interp_centroid)
+                                printf(".centroid");
+                        else
+                                printf(".interp%d", param.interpolation);
+                }
+        } else if (param.flat || param.interpolation) {
+                printf(" /* is_varying not set but varying metadata attached */");
+        }
+
+        if (param.zero1 || param.zero2)
+                printf(" /* zero tripped, %d %d */ ", param.zero1, param.zero2);
+}
+
+static bool
+is_op_varying(unsigned op)
+{
+        switch (op) {
+        case midgard_op_store_vary_16:
+        case midgard_op_store_vary_32:
+        case midgard_op_load_vary_16:
+        case midgard_op_load_vary_32:
+                return true;
+        }
+
+        return false;
+}
+
+static void
+print_load_store_instr(uint64_t data,
+                       unsigned tabs)
+{
+        midgard_load_store_word *word = (midgard_load_store_word *) &data;
+
+        print_ld_st_opcode(word->op);
+
+        if (is_op_varying(word->op))
+                print_varying_parameters(word);
+
+        printf(" r%d", word->reg);
+        print_mask(word->mask);
+
+        int address = word->address;
+
+        if (word->op == midgard_op_load_uniform_32) {
+                /* Uniforms use their own addressing scheme */
+
+                int lo = word->varying_parameters >> 7;
+                int hi = word->address;
+
+                /* TODO: Combine fields logically */
+                address = (hi << 3) | lo;
+        }
+
+        printf(", %d", address);
+
+        print_swizzle(word->swizzle);
+
+        printf(", 0x%X\n", word->unknown);
+}
+
+static void
+print_load_store_word(uint32_t *word, unsigned tabs)
+{
+        midgard_load_store *load_store = (midgard_load_store *) word;
+
+        if (load_store->word1 != 3) {
+                print_load_store_instr(load_store->word1, tabs);
+        }
+
+        if (load_store->word2 != 3) {
+                print_load_store_instr(load_store->word2, tabs);
+        }
+}
+
+static void
+print_texture_reg(bool full, bool select, bool upper)
+{
+        if (full)
+                printf("r%d", REG_TEX_BASE + select);
+        else
+                printf("hr%d", (REG_TEX_BASE + select) * 2 + upper);
+
+        if (full && upper)
+                printf("// error: out full / upper mutually exclusive\n");
+
+}
+
+static void
+print_texture_format(int format)
+{
+        /* Act like a modifier */
+        printf(".");
+
+        switch (format) {
+                DEFINE_CASE(TEXTURE_2D, "2d");
+                DEFINE_CASE(TEXTURE_3D, "3d");
+
+        default:
+                printf("fmt_%d", format);
+                break;
+        }
+}
+
+static void
+print_texture_op(int format)
+{
+        /* Act like a modifier */
+        printf(".");
+
+        switch (format) {
+                DEFINE_CASE(TEXTURE_OP_NORMAL, "normal");
+                DEFINE_CASE(TEXTURE_OP_TEXEL_FETCH, "texelfetch");
+
+        default:
+                printf("op_%d", format);
+                break;
+        }
+}
+
+#undef DEFINE_CASE
+
+static void
+print_texture_word(uint32_t *word, unsigned tabs)
+{
+        midgard_texture_word *texture = (midgard_texture_word *) word;
+
+        /* Instruction family, like ALU words have theirs */
+        printf("texture");
+
+        /* Broad category of texture operation in question */
+        print_texture_op(texture->op);
+
+        /* Specific format in question */
+        print_texture_format(texture->format);
+
+        /* Instruction "modifiers" parallel the ALU instructions. First group
+         * are modifiers that act alone */
+
+        if (!texture->filter)
+                printf(".raw");
+
+        if (texture->shadow)
+                printf(".shadow");
+
+        if (texture->cont)
+                printf(".cont");
+
+        if (texture->last)
+                printf(".last");
+
+        /* Second set are modifiers which take an extra argument each */
+
+        if (texture->has_offset)
+                printf(".offset");
+
+        if (texture->bias)
+                printf(".bias");
+
+        printf(" ");
+
+        print_texture_reg(texture->out_full, texture->out_reg_select, texture->out_upper);
+        print_mask(texture->mask);
+        printf(", ");
+
+        printf("texture%d, ", texture->texture_handle);
+
+        printf("sampler%d", texture->sampler_handle);
+        print_swizzle(texture->swizzle);
+        printf(", ");
+
+        print_texture_reg(/*texture->in_reg_full*/true, texture->in_reg_select, texture->in_reg_upper);
+        printf(".%c%c, ", "xyzw"[texture->in_reg_swizzle_left],
+               "xyzw"[texture->in_reg_swizzle_right]);
+
+        /* TODO: can offsets be full words? */
+        if (texture->has_offset) {
+                print_texture_reg(false, texture->offset_reg_select, texture->offset_reg_upper);
+                printf(", ");
+        }
+
+        if (texture->bias)
+                printf("%f, ", texture->bias / 256.0f);
+
+        printf("\n");
+
+        /* While not zero in general, for these simple instructions the
+         * following unknowns are zero, so we don't include them */
+
+        if (texture->unknown1 ||
+                        texture->unknown2 ||
+                        texture->unknown3 ||
+                        texture->unknown4 ||
+                        texture->unknownA ||
+                        texture->unknownB ||
+                        texture->unknown8 ||
+                        texture->unknown9) {
+                printf("// unknown1 = 0x%x\n", texture->unknown1);
+                printf("// unknown2 = 0x%x\n", texture->unknown2);
+                printf("// unknown3 = 0x%x\n", texture->unknown3);
+                printf("// unknown4 = 0x%x\n", texture->unknown4);
+                printf("// unknownA = 0x%x\n", texture->unknownA);
+                printf("// unknownB = 0x%x\n", texture->unknownB);
+                printf("// unknown8 = 0x%x\n", texture->unknown8);
+                printf("// unknown9 = 0x%x\n", texture->unknown9);
+        }
+
+        /* Similarly, if no offset is applied, these are zero. If an offset
+         * -is- applied, or gradients are used, etc, these are nonzero but
+         *  largely unknown still. */
+
+        if (texture->offset_unknown1 ||
+                        texture->offset_reg_select ||
+                        texture->offset_reg_upper ||
+                        texture->offset_unknown4 ||
+                        texture->offset_unknown5 ||
+                        texture->offset_unknown6 ||
+                        texture->offset_unknown7 ||
+                        texture->offset_unknown8 ||
+                        texture->offset_unknown9) {
+                printf("// offset_unknown1 = 0x%x\n", texture->offset_unknown1);
+                printf("// offset_reg_select = 0x%x\n", texture->offset_reg_select);
+                printf("// offset_reg_upper = 0x%x\n", texture->offset_reg_upper);
+                printf("// offset_unknown4 = 0x%x\n", texture->offset_unknown4);
+                printf("// offset_unknown5 = 0x%x\n", texture->offset_unknown5);
+                printf("// offset_unknown6 = 0x%x\n", texture->offset_unknown6);
+                printf("// offset_unknown7 = 0x%x\n", texture->offset_unknown7);
+                printf("// offset_unknown8 = 0x%x\n", texture->offset_unknown8);
+                printf("// offset_unknown9 = 0x%x\n", texture->offset_unknown9);
+        }
+
+        /* Don't blow up */
+        if (texture->unknown7 != 0x1)
+                printf("// (!) unknown7 = %d\n", texture->unknown7);
+}
+
+void
+disassemble_midgard(uint8_t *code, size_t size)
+{
+        uint32_t *words = (uint32_t *) code;
+        unsigned num_words = size / 4;
+        int tabs = 0;
+
+        bool prefetch_flag = false;
+
+        unsigned i = 0;
+
+        while (i < num_words) {
+                unsigned num_quad_words = midgard_word_size[words[i] & 0xF];
+
+                switch (midgard_word_types[words[i] & 0xF]) {
+                case midgard_word_type_texture:
+                        print_texture_word(&words[i], tabs);
+                        break;
+
+                case midgard_word_type_load_store:
+                        print_load_store_word(&words[i], tabs);
+                        break;
+
+                case midgard_word_type_alu:
+                        print_alu_word(&words[i], num_quad_words, tabs);
+
+                        if (prefetch_flag)
+                                return;
+
+                        /* Reset word static analysis state */
+                        is_embedded_constant_half = false;
+                        is_embedded_constant_int = false;
+
+                        break;
+
+                default:
+                        printf("Unknown word type %u:\n", words[i] & 0xF);
+                        num_quad_words = 1;
+                        print_quad_word(&words[i], tabs);
+                        printf("\n");
+                        break;
+                }
+
+                printf("\n");
+
+                unsigned next = (words[i] & 0xF0) >> 4;
+
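+                /* num_quad_words counts 128-bit quadwords; words[] holds
+                 * 32-bit words, so advance by four words per quadword */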
+                i += 4 * num_quad_words;
+
+                /* Break based on instruction prefetch flag */
+
+                if (i < num_words && next == 1) {
+                        prefetch_flag = true;
+
+                        if (midgard_word_types[words[i] & 0xF] != midgard_word_type_alu)
+                                return;
+                }
+        }
+
+        return;
+}
diff --git a/src/gallium/drivers/panfrost/midgard/disassemble.h b/src/gallium/drivers/panfrost/midgard/disassemble.h
new file mode 100644
index 0000000000..ab1837c201
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/disassemble.h
@@ -0,0 +1,2 @@
+#include <stddef.h>
+#include <stdint.h>
+void disassemble_midgard(uint8_t *code, size_t size);
diff --git a/src/gallium/drivers/panfrost/midgard/helpers.h b/src/gallium/drivers/panfrost/midgard/helpers.h
new file mode 100644
index 0000000000..9e365dc340
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/helpers.h
@@ -0,0 +1,236 @@
+/* Author(s):
+ *  Alyssa Rosenzweig
+ *
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa at rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* Some constants and macros needed by the compiler but not defined in the disassembler */
+
+#define OP_IS_STORE(op) (\
+		op == midgard_op_store_vary_16 || \
+		op == midgard_op_store_vary_32 \
+	)
+
+/* Unit enable bits in the ALU control word; single-bit fields with a lot of space between them */
+
+#define ALU_ENAB_VEC_MUL  (1 << 17)
+#define ALU_ENAB_SCAL_ADD  (1 << 19)
+#define ALU_ENAB_VEC_ADD  (1 << 21)
+#define ALU_ENAB_SCAL_MUL  (1 << 23)
+#define ALU_ENAB_VEC_LUT  (1 << 25)
+#define ALU_ENAB_BR_COMPACT (1 << 26)
+#define ALU_ENAB_BRANCH   (1 << 27)
+
+/* Other opcode properties that don't conflict with the ALU_ENABs, non-ISA */
+
+/* Denotes an opcode that takes a vector input with a fixed number of
+ * channels, but outputs to only a single output channel, like dot products.
+ * For these, to determine the effective mask, this quirk can be set. We have
+ * an intentional off-by-one (a la MALI_POSITIVE), since 0-channel makes no
+ * sense but we need to fit 4 channels in 2 bits. Similarly, 1-channel doesn't
+ * make sense (since then why are we quirked?), so that corresponds to "no
+ * count set" */
+
+#define OP_CHANNEL_COUNT(c) ((c - 1) << 0)
+#define GET_CHANNEL_COUNT(c) ((c & (0x3 << 0)) ? ((c & (0x3 << 0)) + 1) : 0)
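+
+/* For example, fdot3 below is tagged OP_CHANNEL_COUNT(3): the low two bits
+ * store 3 - 1 = 2, and GET_CHANNEL_COUNT recovers 2 + 1 = 3; opcodes without
+ * the quirk keep the low bits at zero and decode back to 0 ("no count set") */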
+
+/* Vector-independent shorthands for the above; these numbers are arbitrary and
+ * not from the ISA. Convert to the above with unit_enum_to_midgard */
+
+#define UNIT_MUL 0
+#define UNIT_ADD 1
+#define UNIT_LUT 2
+
+/* 4-bit type tags */
+
+#define TAG_TEXTURE_4 0x3
+#define TAG_LOAD_STORE_4 0x5
+#define TAG_ALU_4 0x8
+#define TAG_ALU_8 0x9
+#define TAG_ALU_12 0xA
+#define TAG_ALU_16 0xB
+
+/* Special register aliases */
+
+#define MAX_WORK_REGISTERS 16
+
+/* Uniforms begin at (REGISTER_UNIFORMS - uniform_count) */
+#define REGISTER_UNIFORMS 24
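+/* e.g. a shader using four uniform vec4s has them occupy r20-r23 */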
+
+#define REGISTER_UNUSED 24
+#define REGISTER_CONSTANT 26
+#define REGISTER_VARYING_BASE 26
+#define REGISTER_OFFSET 27
+#define REGISTER_TEXTURE_BASE 28
+#define REGISTER_SELECT 31
+
+/* Special uniforms used for e.g. vertex epilogues */
+#define SPECIAL_UNIFORM_BASE (1 << 24)
+#define UNIFORM_VIEWPORT (SPECIAL_UNIFORM_BASE + 0)
+
+/* SSA helper aliases to mimic the registers. UNUSED_0 encoded as an inline
+ * constant. UNUSED_1 encoded as REGISTER_UNUSED */
+
+#define SSA_UNUSED_0 0
+#define SSA_UNUSED_1 -2
+
+#define SSA_FIXED_SHIFT 24
+#define SSA_FIXED_REGISTER(reg) ((1 + reg) << SSA_FIXED_SHIFT)
+#define SSA_REG_FROM_FIXED(reg) ((reg >> SSA_FIXED_SHIFT) - 1)
+#define SSA_FIXED_MINIMUM SSA_FIXED_REGISTER(0)
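+
+/* e.g. SSA_FIXED_REGISTER(0) is (1 << 24) == SSA_FIXED_MINIMUM and maps back
+ * to r0 via SSA_REG_FROM_FIXED; any index below SSA_FIXED_MINIMUM refers to a
+ * true SSA value rather than a fixed register */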
+
+/* Swizzle support */
+
+#define SWIZZLE(A, B, C, D) ((D << 6) | (C << 4) | (B << 2) | (A << 0))
+#define SWIZZLE_FROM_ARRAY(r) SWIZZLE(r[0], r[1], r[2], r[3])
+#define COMPONENT_X 0x0
+#define COMPONENT_Y 0x1
+#define COMPONENT_Z 0x2
+#define COMPONENT_W 0x3
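+
+/* e.g. the identity swizzle .xyzw encodes as
+ * SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) == 0xE4, and a
+ * broadcast .xxxx encodes as 0x00 */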
+
+/* See ISA notes */
+
+#define LDST_NOP (3)
+
+/* Is this opcode an integer operation? */
+static bool
+midgard_is_integer_op(int op)
+{
+        switch (op) {
+        case midgard_alu_op_iadd:
+        case midgard_alu_op_ishladd:
+        case midgard_alu_op_isub:
+        case midgard_alu_op_imul:
+        case midgard_alu_op_imin:
+        case midgard_alu_op_imax:
+        case midgard_alu_op_iasr:
+        case midgard_alu_op_ilsr:
+        case midgard_alu_op_ishl:
+        case midgard_alu_op_iand:
+        case midgard_alu_op_ior:
+        case midgard_alu_op_inot:
+        case midgard_alu_op_iandnot:
+        case midgard_alu_op_ixor:
+        case midgard_alu_op_imov:
+
+        //case midgard_alu_op_f2i:
+        //case midgard_alu_op_f2u:
+        case midgard_alu_op_ieq:
+        case midgard_alu_op_ine:
+        case midgard_alu_op_ilt:
+        case midgard_alu_op_ile:
+        case midgard_alu_op_iball_eq:
+        case midgard_alu_op_ibany_neq:
+
+        //case midgard_alu_op_i2f:
+        //case midgard_alu_op_u2f:
+        case midgard_alu_op_icsel:
+                return true;
+
+        default:
+                return false;
+        }
+}
+
+/* There are five ALU units: VMUL, VADD, SMUL, SADD, LUT. A given opcode is
+ * implemented on some subset of these units (or occasionally all of them).
+ * This table encodes a bit mask of valid units for each opcode, so the
+ * scheduler can figure out where to plonk the instruction. */
+
+/* Shorthands for each unit */
+#define UNIT_VMUL ALU_ENAB_VEC_MUL
+#define UNIT_SADD ALU_ENAB_SCAL_ADD
+#define UNIT_VADD ALU_ENAB_VEC_ADD
+#define UNIT_SMUL ALU_ENAB_SCAL_MUL
+#define UNIT_VLUT ALU_ENAB_VEC_LUT
+
+/* Shorthands for usual combinations of units. LUT is intentionally excluded
+ * since it's nutty. */
+
+#define UNITS_MUL (UNIT_VMUL | UNIT_SMUL)
+#define UNITS_ADD (UNIT_VADD | UNIT_SADD)
+#define UNITS_ALL (UNITS_MUL | UNITS_ADD)
+#define UNITS_SCALAR (UNIT_SADD | UNIT_SMUL)
+#define UNITS_VECTOR (UNIT_VMUL | UNIT_VADD)
+#define UNITS_ANY_VECTOR (UNITS_VECTOR | UNIT_VLUT)
+
+static int alu_opcode_props[256] = {
+        [midgard_alu_op_fadd]		 = UNITS_ADD,
+        [midgard_alu_op_fmul]		 = UNITS_MUL | UNIT_VLUT,
+        [midgard_alu_op_fmin]		 = UNITS_MUL | UNITS_ADD,
+        [midgard_alu_op_fmax]		 = UNITS_MUL | UNITS_ADD,
+        [midgard_alu_op_imin]		 = UNITS_ALL,
+        [midgard_alu_op_imax]		 = UNITS_ALL,
+        [midgard_alu_op_fmov]		 = UNITS_ALL | UNIT_VLUT,
+        [midgard_alu_op_ffloor]		 = UNITS_ADD,
+        [midgard_alu_op_fceil]		 = UNITS_ADD,
+
+        /* Though they output a scalar, they need to run on a vector unit
+         * since they process vectors */
+        [midgard_alu_op_fdot3]		 = UNIT_VMUL | OP_CHANNEL_COUNT(3),
+        [midgard_alu_op_fdot4]		 = UNIT_VMUL | OP_CHANNEL_COUNT(4),
+
+        [midgard_alu_op_iadd]		 = UNITS_ADD,
+        [midgard_alu_op_isub]		 = UNITS_ADD,
+        [midgard_alu_op_imul]		 = UNITS_ALL,
+        [midgard_alu_op_imov]		 = UNITS_ALL,
+
+        /* For vector comparisons, use ball etc */
+        [midgard_alu_op_feq]		 = UNITS_ALL,
+        [midgard_alu_op_fne]		 = UNITS_ALL,
+        [midgard_alu_op_flt]		 = UNIT_SADD,
+        [midgard_alu_op_ieq]		 = UNITS_ALL,
+        [midgard_alu_op_ine]		 = UNITS_ALL,
+        [midgard_alu_op_ilt]		 = UNITS_ALL,
+        [midgard_alu_op_ile]		 = UNITS_ALL,
+
+        [midgard_alu_op_icsel]		 = UNITS_ADD,
+        [midgard_alu_op_fcsel]		 = UNITS_ADD | UNIT_SMUL,
+
+        [midgard_alu_op_frcp]		 = UNIT_VLUT,
+        [midgard_alu_op_frsqrt]		 = UNIT_VLUT,
+        [midgard_alu_op_fsqrt]		 = UNIT_VLUT,
+        [midgard_alu_op_fexp2]		 = UNIT_VLUT,
+        [midgard_alu_op_flog2]		 = UNIT_VLUT,
+
+        [midgard_alu_op_f2i]		 = UNITS_ADD,
+        [midgard_alu_op_f2u]		 = UNITS_ADD,
+        [midgard_alu_op_f2u8]		 = UNITS_ADD,
+        [midgard_alu_op_i2f]		 = UNITS_ADD,
+        [midgard_alu_op_u2f]		 = UNITS_ADD,
+
+        [midgard_alu_op_fsin]		 = UNIT_VLUT,
+        [midgard_alu_op_fcos]		 = UNIT_VLUT,
+
+        [midgard_alu_op_iand]		 = UNITS_ADD, /* XXX: Test case where it's right on smul but not sadd */
+        [midgard_alu_op_ior]		 = UNITS_ADD,
+        [midgard_alu_op_ixor]		 = UNITS_ADD,
+        [midgard_alu_op_inot]		 = UNITS_ALL,
+        [midgard_alu_op_ishl]		 = UNITS_ADD,
+        [midgard_alu_op_iasr]		 = UNITS_ADD,
+        [midgard_alu_op_ilsr]		 = UNITS_ADD,
+
+        [midgard_alu_op_fball_eq]	 = UNITS_ALL,
+        [midgard_alu_op_fbany_neq]	 = UNITS_ALL,
+        [midgard_alu_op_iball_eq]	 = UNITS_ALL,
+        [midgard_alu_op_ibany_neq]	 = UNITS_ALL
+};
diff --git a/src/gallium/drivers/panfrost/midgard/midgard-parse.h b/src/gallium/drivers/panfrost/midgard/midgard-parse.h
new file mode 100644
index 0000000000..5d13483940
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard-parse.h
@@ -0,0 +1,70 @@
+/* Author(s):
+ *   Connor Abbott
+ *   Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor at abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa at rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __midgard_parse_h__
+#define __midgard_parse_h__
+
+/* Additional metadata for parsing Midgard binaries, not needed for compilation */
+
+static midgard_word_type midgard_word_types[16] = {
+        midgard_word_type_unknown,    /* 0x0 */
+        midgard_word_type_unknown,    /* 0x1 */
+        midgard_word_type_texture,    /* 0x2 */
+        midgard_word_type_texture,    /* 0x3 */
+        midgard_word_type_unknown,    /* 0x4 */
+        midgard_word_type_load_store, /* 0x5 */
+        midgard_word_type_unknown,    /* 0x6 */
+        midgard_word_type_unknown,    /* 0x7 */
+        midgard_word_type_alu,        /* 0x8 */
+        midgard_word_type_alu,        /* 0x9 */
+        midgard_word_type_alu,        /* 0xA */
+        midgard_word_type_alu,        /* 0xB */
+        midgard_word_type_alu,        /* 0xC */
+        midgard_word_type_alu,        /* 0xD */
+        midgard_word_type_alu,        /* 0xE */
+        midgard_word_type_alu,        /* 0xF */
+};
+
+static unsigned midgard_word_size[16] = {
+        0, /* 0x0 */
+        0, /* 0x1 */
+        1, /* 0x2 */
+        1, /* 0x3 */
+        0, /* 0x4 */
+        1, /* 0x5 */
+        0, /* 0x6 */
+        0, /* 0x7 */
+        1, /* 0x8 */
+        2, /* 0x9 */
+        3, /* 0xA */
+        4, /* 0xB */
+        1, /* 0xC */
+        2, /* 0xD */
+        3, /* 0xE */
+        4, /* 0xF */
+};
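+
+/* Sizes are in 128-bit quadwords: e.g. a bundle tagged 0x9 (TAG_ALU_8 in
+ * helpers.h) spans two quadwords, i.e. eight 32-bit words, which is how far
+ * the disassembler advances after printing it */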
+
+#endif
diff --git a/src/gallium/drivers/panfrost/midgard/midgard.h b/src/gallium/drivers/panfrost/midgard/midgard.h
new file mode 100644
index 0000000000..b6cd38a5cd
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard.h
@@ -0,0 +1,473 @@
+/* Author(s):
+ *   Connor Abbott
+ *   Alyssa Rosenzweig
+ *
+ * Copyright (c) 2013 Connor Abbott (connor at abbott.cx)
+ * Copyright (c) 2018 Alyssa Rosenzweig (alyssa at rosenzweig.io)
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __midgard_h__
+#define __midgard_h__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+typedef enum {
+        midgard_word_type_alu,
+        midgard_word_type_load_store,
+        midgard_word_type_texture,
+        midgard_word_type_unknown
+} midgard_word_type;
+
+typedef enum {
+        midgard_alu_vmul,
+        midgard_alu_sadd,
+        midgard_alu_smul,
+        midgard_alu_vadd,
+        midgard_alu_lut
+} midgard_alu;
+
+/*
+ * ALU words
+ */
+
+typedef enum {
+        midgard_alu_op_fadd       = 0x10,
+        midgard_alu_op_fmul       = 0x14,
+        midgard_alu_op_fmin       = 0x28,
+        midgard_alu_op_fmax       = 0x2C,
+        midgard_alu_op_fmov       = 0x30,
+        midgard_alu_op_ffloor     = 0x36,
+        midgard_alu_op_fceil      = 0x37,
+        midgard_alu_op_fdot3      = 0x3C,
+        midgard_alu_op_fdot3r     = 0x3D,
+        midgard_alu_op_fdot4      = 0x3E,
+        midgard_alu_op_freduce    = 0x3F,
+        midgard_alu_op_iadd       = 0x40,
+        midgard_alu_op_ishladd    = 0x41,
+        midgard_alu_op_isub       = 0x46,
+        midgard_alu_op_imul       = 0x58,
+        midgard_alu_op_imin       = 0x60,
+        midgard_alu_op_imax       = 0x62,
+        midgard_alu_op_iasr       = 0x68,
+        midgard_alu_op_ilsr       = 0x69,
+        midgard_alu_op_ishl       = 0x6E,
+        midgard_alu_op_iand       = 0x70,
+        midgard_alu_op_ior        = 0x71,
+        midgard_alu_op_inot       = 0x72,
+        midgard_alu_op_iandnot    = 0x74, /* (a, b) -> a & ~b, used for not/b2f */
+        midgard_alu_op_ixor       = 0x76,
+        midgard_alu_op_imov       = 0x7B,
+        midgard_alu_op_feq        = 0x80,
+        midgard_alu_op_fne        = 0x81,
+        midgard_alu_op_flt        = 0x82,
+        midgard_alu_op_fle        = 0x83,
+        midgard_alu_op_fball_eq   = 0x88,
+        midgard_alu_op_bball_eq   = 0x89,
+        midgard_alu_op_bbany_neq  = 0x90, /* used for bvec4(1) */
+        midgard_alu_op_fbany_neq  = 0x91, /* bvec4(0) also */
+        midgard_alu_op_f2i        = 0x99,
+        midgard_alu_op_f2u8       = 0x9C,
+        midgard_alu_op_f2u        = 0x9D,
+        midgard_alu_op_ieq        = 0xA0,
+        midgard_alu_op_ine        = 0xA1,
+        midgard_alu_op_ilt        = 0xA4,
+        midgard_alu_op_ile        = 0xA5,
+        midgard_alu_op_iball_eq   = 0xA8,
+        midgard_alu_op_ball       = 0xA9,
+        midgard_alu_op_ibany_neq  = 0xB1,
+        midgard_alu_op_i2f        = 0xB8,
+        midgard_alu_op_u2f        = 0xBC,
+        midgard_alu_op_icsel      = 0xC1,
+        midgard_alu_op_fcsel      = 0xC5,
+        midgard_alu_op_fatan_pt2  = 0xE8,
+        midgard_alu_op_frcp       = 0xF0,
+        midgard_alu_op_frsqrt     = 0xF2,
+        midgard_alu_op_fsqrt      = 0xF3,
+        midgard_alu_op_fexp2      = 0xF4,
+        midgard_alu_op_flog2      = 0xF5,
+        midgard_alu_op_fsin       = 0xF6,
+        midgard_alu_op_fcos       = 0xF7,
+        midgard_alu_op_fatan2_pt1 = 0xF9,
+} midgard_alu_op;
+
+typedef enum {
+        midgard_outmod_none = 0,
+        midgard_outmod_pos  = 1,
+        midgard_outmod_int  = 2,
+        midgard_outmod_sat  = 3
+} midgard_outmod;
+
+typedef enum {
+        midgard_reg_mode_quarter = 0,
+        midgard_reg_mode_half = 1,
+        midgard_reg_mode_full = 2,
+        midgard_reg_mode_double = 3 /* TODO: verify */
+} midgard_reg_mode;
+
+typedef enum {
+        midgard_dest_override_lower = 0,
+        midgard_dest_override_upper = 1,
+        midgard_dest_override_none = 2
+} midgard_dest_override;
+
+typedef struct
+__attribute__((__packed__))
+{
+        bool abs         : 1;
+        bool negate      : 1;
+
+        /* replicate lower half if dest = half, or low/high half selection if
+         * dest = full
+         */
+        bool rep_low     : 1;
+        bool rep_high    : 1; /* unused if dest = full */
+        bool half        : 1; /* only matters if dest = full */
+        unsigned swizzle : 8;
+}
+midgard_vector_alu_src;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_alu_op op                   :  8;
+        midgard_reg_mode reg_mode           :  2;
+        unsigned src1                       : 13;
+        unsigned src2                       : 13;
+        midgard_dest_override dest_override :  2;
+        midgard_outmod outmod               :  2;
+        unsigned mask                       :  8;
+}
+midgard_vector_alu;
+
+typedef struct
+__attribute__((__packed__))
+{
+        bool abs           : 1;
+        bool negate        : 1;
+        bool full          : 1; /* 0 = half, 1 = full */
+        unsigned component : 3;
+}
+midgard_scalar_alu_src;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_alu_op op         :  8;
+        unsigned src1             :  6;
+        unsigned src2             : 11;
+        unsigned unknown          :  1;
+        midgard_outmod outmod     :  2;
+        bool output_full          :  1;
+        unsigned output_component :  3;
+}
+midgard_scalar_alu;
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned src1_reg : 5;
+        unsigned src2_reg : 5;
+        unsigned out_reg  : 5;
+        bool src2_imm     : 1;
+}
+midgard_reg_info;
+
+typedef enum {
+        midgard_jmp_writeout_op_branch_uncond = 1,
+        midgard_jmp_writeout_op_branch_cond = 2,
+        midgard_jmp_writeout_op_discard = 4,
+        midgard_jmp_writeout_op_writeout = 7,
+} midgard_jmp_writeout_op;
+
+typedef enum {
+        midgard_condition_write0 = 0,
+        midgard_condition_false = 1,
+        midgard_condition_true = 2,
+        midgard_condition_always = 3, /* Special for writeout/uncond discard */
+} midgard_condition;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_jmp_writeout_op op : 3; /* == branch_uncond */
+        unsigned dest_tag : 4; /* tag of branch destination */
+        unsigned unknown : 2;
+        int offset : 7;
+}
+midgard_branch_uncond;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_jmp_writeout_op op : 3; /* == branch_cond */
+        unsigned dest_tag : 4; /* tag of branch destination */
+        int offset : 7;
+        midgard_condition cond : 2;
+}
+midgard_branch_cond;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_jmp_writeout_op op : 3; /* == branch_cond */
+        unsigned dest_tag : 4; /* tag of branch destination */
+        unsigned unknown : 2;
+        signed offset : 7;
+        unsigned zero : 16;
+        unsigned cond : 16;
+}
+midgard_branch_extended;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_jmp_writeout_op op : 3; /* == writeout */
+        unsigned unknown : 13;
+}
+midgard_writeout;
+
+/*
+ * Load/store words
+ */
+
+typedef enum {
+        midgard_op_ld_st_noop   = 0x03,
+        midgard_op_load_attr_16 = 0x95,
+        midgard_op_load_attr_32 = 0x94,
+        midgard_op_load_vary_16 = 0x99,
+        midgard_op_load_vary_32 = 0x98,
+        midgard_op_load_color_buffer_16 = 0x9D,
+        midgard_op_load_color_buffer_8 = 0xBA,
+        midgard_op_load_uniform_16 = 0xAC,
+        midgard_op_load_uniform_32 = 0xB0,
+        midgard_op_store_vary_16 = 0xD5,
+        midgard_op_store_vary_32 = 0xD4
+} midgard_load_store_op;
+
+typedef enum {
+        midgard_interp_centroid = 1,
+        midgard_interp_default = 2
+} midgard_interpolation;
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned zero1 : 4; /* Always zero */
+
+        /* Varying qualifiers, zero if not a varying */
+        unsigned flat    : 1;
+        unsigned is_varying : 1; /* Always one for varying, but maybe something else? */
+        midgard_interpolation interpolation : 2;
+
+        unsigned zero2 : 2; /* Always zero */
+}
+midgard_varying_parameter;
+
+typedef struct
+__attribute__((__packed__))
+{
+        midgard_load_store_op op : 8;
+        unsigned reg     : 5;
+        unsigned mask    : 4;
+        unsigned swizzle : 8;
+        unsigned unknown : 16;
+
+        unsigned varying_parameters : 10;
+
+        unsigned address : 9;
+}
+midgard_load_store_word;
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned type      : 4;
+        unsigned next_type : 4;
+        uint64_t word1     : 60;
+        uint64_t word2     : 60;
+}
+midgard_load_store;
+
+/* Texture pipeline results are in r28-r29 */
+#define REG_TEX_BASE 28
+
+/* Texture opcodes... maybe? */
+#define TEXTURE_OP_NORMAL 0x11
+#define TEXTURE_OP_TEXEL_FETCH 0x14
+
+/* Texture format types, found in format */
+#define TEXTURE_CUBE 0x00
+#define TEXTURE_2D 0x02
+#define TEXTURE_3D 0x03
+
+typedef struct
+__attribute__((__packed__))
+{
+        unsigned type      : 4;
+        unsigned next_type : 4;
+
+        unsigned op  : 6;
+        unsigned shadow    : 1;
+        unsigned unknown3  : 1;
+
+        /* A little obscure, but last is set for the last texture operation in
+         * a shader. cont appears to just be last's opposite (?). Yeah, I know,
+         * kind of funky.. BiOpen thinks it could do with memory hinting, or
+         * tile locking? */
+
+        unsigned cont  : 1;
+        unsigned last  : 1;
+
+        unsigned format    : 5;
+        unsigned has_offset : 1;
+
+        /* Like in Bifrost */
+        unsigned filter  : 1;
+
+        unsigned in_reg_select : 1;
+        unsigned in_reg_upper  : 1;
+
+        unsigned in_reg_swizzle_left : 2;
+        unsigned in_reg_swizzle_right : 2;
+
+        unsigned unknown1 : 2;
+
+        unsigned unknown8  : 4;
+
+        unsigned out_full  : 1;
+
+        /* Always 1 afaict... */
+        unsigned unknown7  : 2;
+
+        unsigned out_reg_select : 1;
+        unsigned out_upper : 1;
+
+        unsigned mask : 4;
+
+        unsigned unknown2  : 2;
+
+        unsigned swizzle  : 8;
+        unsigned unknown4  : 8;
+
+        unsigned unknownA  : 4;
+
+        unsigned offset_unknown1  : 1;
+        unsigned offset_reg_select : 1;
+        unsigned offset_reg_upper : 1;
+        unsigned offset_unknown4  : 1;
+        unsigned offset_unknown5  : 1;
+        unsigned offset_unknown6  : 1;
+        unsigned offset_unknown7  : 1;
+        unsigned offset_unknown8  : 1;
+        unsigned offset_unknown9  : 1;
+
+        unsigned unknownB  : 3;
+
+        /* Texture bias or LOD, depending on whether it is executed in a
+         * fragment/vertex shader respectively. Compute as int(2^8 * biasf).
+         *
+         * For texel fetch, this is the LOD as is. */
+        unsigned bias  : 8;
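+        /* e.g. a bias of 0.5 encodes as int(2^8 * 0.5) = 128, which the
+         * disassembler prints back as 128 / 256.0 = 0.5 */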
+
+        unsigned unknown9  : 8;
+
+        unsigned texture_handle : 16;
+        unsigned sampler_handle : 16;
+}
+midgard_texture_word;
+
+/* Opcode name table */
+
+static char *alu_opcode_names[256] = {
+        [midgard_alu_op_fadd]       = "fadd",
+        [midgard_alu_op_fmul]       = "fmul",
+        [midgard_alu_op_fmin]       = "fmin",
+        [midgard_alu_op_fmax]       = "fmax",
+        [midgard_alu_op_fmov]       = "fmov",
+        [midgard_alu_op_ffloor]     = "ffloor",
+        [midgard_alu_op_fceil]      = "fceil",
+        [midgard_alu_op_fdot3]      = "fdot3",
+        [midgard_alu_op_fdot3r]     = "fdot3r",
+        [midgard_alu_op_fdot4]      = "fdot4",
+        [midgard_alu_op_freduce]    = "freduce",
+        [midgard_alu_op_imin]       = "imin",
+        [midgard_alu_op_imax]       = "imax",
+        [midgard_alu_op_ishl]       = "ishl",
+        [midgard_alu_op_iasr]       = "iasr",
+        [midgard_alu_op_ilsr]       = "ilsr",
+        [midgard_alu_op_iadd]       = "iadd",
+        [midgard_alu_op_ishladd]    = "ishladd",
+        [midgard_alu_op_isub]       = "isub",
+        [midgard_alu_op_imul]       = "imul",
+        [midgard_alu_op_imov]       = "imov",
+        [midgard_alu_op_iand]       = "iand",
+        [midgard_alu_op_ior]        = "ior",
+        [midgard_alu_op_inot]       = "inot",
+        [midgard_alu_op_iandnot]    = "iandnot",
+        [midgard_alu_op_ixor]       = "ixor",
+        [midgard_alu_op_feq]        = "feq",
+        [midgard_alu_op_fne]        = "fne",
+        [midgard_alu_op_flt]        = "flt",
+        [midgard_alu_op_fle]        = "fle",
+        [midgard_alu_op_fball_eq]   = "fball_eq",
+        [midgard_alu_op_fbany_neq]  = "fbany_neq",
+        [midgard_alu_op_bball_eq]   = "bball_eq",
+        [midgard_alu_op_bbany_neq]  = "bbany_neq",
+        [midgard_alu_op_f2i]        = "f2i",
+        [midgard_alu_op_f2u]        = "f2u",
+        [midgard_alu_op_f2u8]       = "f2u8",
+        [midgard_alu_op_ieq]        = "ieq",
+        [midgard_alu_op_ine]        = "ine",
+        [midgard_alu_op_ilt]        = "ilt",
+        [midgard_alu_op_ile]        = "ile",
+        [midgard_alu_op_iball_eq]   = "iball_eq",
+        [midgard_alu_op_ball]       = "ball",
+        [midgard_alu_op_ibany_neq]  = "ibany_neq",
+        [midgard_alu_op_i2f]        = "i2f",
+        [midgard_alu_op_u2f]        = "u2f",
+        [midgard_alu_op_icsel]      = "icsel",
+        [midgard_alu_op_fcsel]      = "fcsel",
+        [midgard_alu_op_fatan_pt2]  = "fatan_pt2",
+        [midgard_alu_op_frcp]       = "frcp",
+        [midgard_alu_op_frsqrt]     = "frsqrt",
+        [midgard_alu_op_fsqrt]      = "fsqrt",
+        [midgard_alu_op_fexp2]      = "fexp2",
+        [midgard_alu_op_flog2]      = "flog2",
+        [midgard_alu_op_fsin]       = "fsin",
+        [midgard_alu_op_fcos]       = "fcos",
+        [midgard_alu_op_fatan2_pt1] = "fatan2_pt1"
+};
+
+static char *load_store_opcode_names[256] = {
+        [midgard_op_load_attr_16] = "ld_attr_16",
+        [midgard_op_load_attr_32] = "ld_attr_32",
+        [midgard_op_load_vary_16] = "ld_vary_16",
+        [midgard_op_load_vary_32] = "ld_vary_32",
+        [midgard_op_load_uniform_16] = "ld_uniform_16",
+        [midgard_op_load_uniform_32] = "ld_uniform_32",
+        [midgard_op_load_color_buffer_8] = "ld_color_buffer_8",
+        [midgard_op_load_color_buffer_16] = "ld_color_buffer_16",
+        [midgard_op_store_vary_16] = "st_vary_16",
+        [midgard_op_store_vary_32] = "st_vary_32"
+};
+
+#endif
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.c b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
new file mode 100644
index 0000000000..07e3513278
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.c
@@ -0,0 +1,3621 @@
+/*
+ * Copyright (C) 2018 Alyssa Rosenzweig <alyssa at rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <err.h>
+
+#include "compiler/glsl/glsl_to_nir.h"
+#include "compiler/nir_types.h"
+#include "main/imports.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/half_float.h"
+#include "util/register_allocate.h"
+#include "util/u_dynarray.h"
+#include "util/list.h"
+#include "main/mtypes.h"
+
+#include "midgard.h"
+#include "midgard_nir.h"
+#include "midgard_compile.h"
+#include "helpers.h"
+
+#include "disassemble.h"
+
+/* Instruction arguments represented as block-local SSA indices, rather than
+ * registers. Negative values mean unused. */
+
+typedef struct {
+        int src0;
+        int src1;
+        int dest;
+
+        /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged
+         * in. Only valid for ALU ops. */
+        bool inline_constant;
+} ssa_args;
+
+/* Forward declare so midgard_branch can reference */
+struct midgard_block;
+
+/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to
+ * the hardware), which is why it must be zero */
+
+#define TARGET_GOTO 0
+#define TARGET_BREAK 1
+#define TARGET_CONTINUE 2
+
+typedef struct midgard_branch {
+        /* If conditional, the condition is specified in r31.w */
+        bool conditional;
+
+        /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */
+        bool invert_conditional;
+
+        /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). Value is one of TARGET_ */
+        unsigned target_type;
+
+        /* The actual target */
+        union {
+                int target_block;
+                int target_break;
+                int target_continue;
+        };
+} midgard_branch;
+
+/* Generic in-memory data type representing a single logical instruction, rather
+ * than a single instruction group. This is the preferred form for code gen.
+ * Multiple midgard_instructions will later be combined during scheduling,
+ * though this is not represented in this structure. Its format bridges
+ * the low-level binary representation with the higher level semantic meaning.
+ *
+ * Notably, it allows registers to be specified as block local SSA, for code
+ * emitted before the register allocation pass.
+ */
+
+typedef struct midgard_instruction {
+        /* Must be first for casting */
+        struct list_head link;
+
+        unsigned type; /* ALU, load/store, texture */
+
+        /* If the register allocator has not run yet... */
+        ssa_args ssa_args;
+
+        /* Special fields for an ALU instruction */
+        midgard_reg_info registers;
+
+        /* I.e. (1 << alu_bit) */
+        int unit;
+
+        bool has_constants;
+        float constants[4];
+        uint16_t inline_constant;
+        bool has_blend_constant;
+
+        bool compact_branch;
+        bool writeout;
+        bool prepacked_branch;
+
+        union {
+                midgard_load_store_word load_store;
+                midgard_vector_alu alu;
+                midgard_texture_word texture;
+                uint16_t br_compact;
+
+                /* General branch, rather than packed br_compact. Higher level
+                 * than the other components */
+                midgard_branch branch;
+        };
+} midgard_instruction;
+
+typedef struct midgard_block {
+        /* Link to next block. Must be first for mir_get_block */
+        struct list_head link;
+
+        /* List of midgard_instructions emitted for the current block */
+        struct list_head instructions;
+
+        bool is_scheduled;
+
+        /* List of midgard_bundles emitted (after the scheduler has run) */
+        struct util_dynarray bundles;
+
+        /* Number of quadwords _actually_ emitted, as determined after scheduling */
+        unsigned quadword_count;
+
+        struct midgard_block *next_fallthrough;
+} midgard_block;
+
+/* Helpers to generate midgard_instructions using macro magic, since every
+ * driver seems to do it that way */
+
+#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__));
+
+#define M_LOAD_STORE(name, rname, uname) \
+	static midgard_instruction m_##name(unsigned ssa, unsigned address) { \
+		midgard_instruction i = { \
+			.type = TAG_LOAD_STORE_4, \
+			.ssa_args = { \
+				.rname = ssa, \
+				.uname = -1, \
+				.src1 = -1 \
+			}, \
+			.load_store = { \
+				.op = midgard_op_##name, \
+				.mask = 0xF, \
+				.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), \
+				.address = address \
+			} \
+		}; \
+		\
+		return i; \
+	}
+
+#define M_LOAD(name) M_LOAD_STORE(name, dest, src0)
+#define M_STORE(name) M_LOAD_STORE(name, src0, dest)
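+
+/* e.g. M_LOAD(load_attr_32) below defines m_load_attr_32(ssa, address): a
+ * load/store instruction whose SSA destination is `ssa`, loading a full vec4
+ * (mask 0xF, identity swizzle) from the given attribute address */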
+
+const midgard_vector_alu_src blank_alu_src = {
+        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+};
+
+const midgard_scalar_alu_src blank_scalar_alu_src = {
+        .full = true
+};
+
+/* Used for encoding the unused source of 1-op instructions */
+const midgard_vector_alu_src zero_alu_src = { 0 };
+
+/* Coerce structs to integer */
+
+static unsigned
+vector_alu_srco_unsigned(midgard_vector_alu_src src)
+{
+        unsigned u;
+        memcpy(&u, &src, sizeof(src));
+        return u;
+}
+
+/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs
+ * the corresponding Midgard source */
+
+static midgard_vector_alu_src
+vector_alu_modifiers(nir_alu_src *src)
+{
+        if (!src) return blank_alu_src;
+
+        midgard_vector_alu_src alu_src = {
+                .abs = src->abs,
+                .negate = src->negate,
+                .rep_low = 0,
+                .rep_high = 0,
+                .half = 0, /* TODO */
+                .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle)
+        };
+
+        return alu_src;
+}
+
+/* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */
+
+static midgard_instruction
+v_fmov(unsigned src, midgard_vector_alu_src mod, unsigned dest)
+{
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .ssa_args = {
+                        .src0 = SSA_UNUSED_1,
+                        .src1 = src,
+                        .dest = dest,
+                },
+                .alu = {
+                        .op = midgard_alu_op_fmov,
+                        .reg_mode = midgard_reg_mode_full,
+                        .dest_override = midgard_dest_override_none,
+                        .mask = 0xFF,
+                        .src1 = vector_alu_srco_unsigned(zero_alu_src),
+                        .src2 = vector_alu_srco_unsigned(mod)
+                },
+        };
+
+        return ins;
+}
+
+/* load/store instructions have both 32-bit and 16-bit variants, depending on
+ * whether we are using vectors composed of highp or mediump. At the moment, we
+ * don't support half-floats -- this requires changes in other parts of the
+ * compiler -- therefore the 16-bit versions are commented out. */
+
+//M_LOAD(load_attr_16);
+M_LOAD(load_attr_32);
+//M_LOAD(load_vary_16);
+M_LOAD(load_vary_32);
+//M_LOAD(load_uniform_16);
+M_LOAD(load_uniform_32);
+M_LOAD(load_color_buffer_8);
+//M_STORE(store_vary_16);
+M_STORE(store_vary_32);
+
+static midgard_instruction
+v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond)
+{
+        midgard_branch_cond branch = {
+                .op = op,
+                .dest_tag = tag,
+                .offset = offset,
+                .cond = cond
+        };
+
+        uint16_t compact;
+        memcpy(&compact, &branch, sizeof(branch));
+
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .unit = ALU_ENAB_BR_COMPACT,
+                .prepacked_branch = true,
+                .compact_branch = true,
+                .br_compact = compact
+        };
+
+        if (op == midgard_jmp_writeout_op_writeout)
+                ins.writeout = true;
+
+        return ins;
+}
+
+static midgard_instruction
+v_branch(bool conditional, bool invert)
+{
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .unit = ALU_ENAB_BR_COMPACT,
+                .compact_branch = true,
+                .branch = {
+                        .conditional = conditional,
+                        .invert_conditional = invert
+                }
+        };
+
+        return ins;
+}
+
+typedef struct midgard_bundle {
+        /* Tag for the overall bundle */
+        int tag;
+
+        /* Instructions contained by the bundle */
+        int instruction_count;
+        midgard_instruction instructions[5];
+
+        /* Bundle-wide ALU configuration */
+        int padding;
+        int control;
+        bool has_embedded_constants;
+        float constants[4];
+        bool has_blend_constant;
+
+        uint16_t register_words[8];
+        int register_words_count;
+
+        uint64_t body_words[8];
+        size_t body_size[8];
+        int body_words_count;
+} midgard_bundle;
+
+typedef struct compiler_context {
+        nir_shader *nir;
+        gl_shader_stage stage;
+
+        /* Is internally a blend shader? Depends on stage == FRAGMENT */
+        bool is_blend;
+
+        /* Tracking for blend constant patching */
+        int blend_constant_number;
+        int blend_constant_offset;
+
+        /* Current NIR function */
+        nir_function *func;
+
+        /* Unordered list of midgard_blocks */
+        int block_count;
+        struct list_head blocks;
+
+        midgard_block *initial_block;
+        midgard_block *previous_source_block;
+        midgard_block *final_block;
+
+        /* List of midgard_instructions emitted for the current block */
+        midgard_block *current_block;
+
+        /* The index corresponding to the current loop, e.g. for breaks/continues */
+        int current_loop;
+
+        /* Constants which have been loaded, for later inlining */
+        struct hash_table_u64 *ssa_constants;
+
+        /* SSA indices to be outputted to corresponding varying offset */
+        struct hash_table_u64 *ssa_varyings;
+
+        /* SSA values / registers which have been aliased. Naively, these
+         * demand a fmov output; instead, we alias them in a later pass to
+         * avoid the wasted op.
+         *
+         * A note on encoding: to avoid dynamic memory management here, rather
+         * than mapping to a pointer, we map to the source index; the key
+         * itself is just the destination index. */
+
+        struct hash_table_u64 *ssa_to_alias;
+        struct set *leftover_ssa_to_alias;
+
+        /* Actual SSA-to-register for RA */
+        struct hash_table_u64 *ssa_to_register;
+
+        /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */
+        struct hash_table_u64 *hash_to_temp;
+        int temp_count;
+        int max_hash;
+
+        /* Uniform IDs for mdg */
+        struct hash_table_u64 *uniform_nir_to_mdg;
+        int uniform_count;
+
+        struct hash_table_u64 *varying_nir_to_mdg;
+        int varying_count;
+
+        /* Just the count of the max register used. Higher count => higher
+         * register pressure */
+        int work_registers;
+
+        /* Used for cont/last hinting. Increase when a tex op is added.
+         * Decrease when a tex op is removed. */
+        int texture_op_count;
+
+        /* Mapping of texture register -> SSA index for unaliasing */
+        int texture_index[2];
+
+        /* Count of special uniforms (viewport, etc) in vec4 units */
+        int special_uniforms;
+
+        /* If any path hits a discard instruction */
+        bool can_discard;
+
+        /* The number of uniforms allowable for the fast path */
+        int uniform_cutoff;
+
+        /* Count of instructions emitted from NIR overall, across all blocks */
+        int instruction_count;
+
+        /* Alpha ref value passed in */
+        float alpha_ref;
+
+        /* The index corresponding to the fragment output */
+        unsigned fragment_output;
+} compiler_context;
+
+/* Append instruction to end of current block */
+
+static midgard_instruction *
+mir_upload_ins(struct midgard_instruction ins)
+{
+        midgard_instruction *heap = malloc(sizeof(ins));
+        memcpy(heap, &ins, sizeof(ins));
+        return heap;
+}
+
+static void
+emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins)
+{
+        list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions);
+}
+
+static void
+mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins)
+{
+        list_addtail(&(mir_upload_ins(ins))->link, &tag->link);
+}
+
+static void
+mir_remove_instruction(struct midgard_instruction *ins)
+{
+        list_del(&ins->link);
+}
+
+static midgard_instruction*
+mir_prev_op(struct midgard_instruction *ins)
+{
+        return list_last_entry(&(ins->link), midgard_instruction, link);
+}
+
+static midgard_instruction*
+mir_next_op(struct midgard_instruction *ins)
+{
+        return list_first_entry(&(ins->link), midgard_instruction, link);
+}
+
+static midgard_block *
+mir_next_block(struct midgard_block *blk)
+{
+        return list_first_entry(&(blk->link), midgard_block, link);
+}
+
+
+#define mir_foreach_block(ctx, v) list_for_each_entry(struct midgard_block, v, &ctx->blocks, link) 
+#define mir_foreach_block_from(ctx, from, v) list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link)
+
+#define mir_foreach_instr(ctx, v) list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link) 
+#define mir_foreach_instr_safe(ctx, v) list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link) 
+#define mir_foreach_instr_in_block(block, v) list_for_each_entry(struct midgard_instruction, v, &block->instructions, link) 
+#define mir_foreach_instr_in_block_safe(block, v) list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link) 
+#define mir_foreach_instr_in_block_safe_rev(block, v) list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link) 
+#define mir_foreach_instr_in_block_from(block, v, from) list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link) 
+
+
+static midgard_instruction *
+mir_last_in_block(struct midgard_block *block)
+{
+        return list_last_entry(&block->instructions, struct midgard_instruction, link);
+}
+
+static midgard_block *
+mir_get_block(compiler_context *ctx, int idx)
+{
+        struct list_head *lst = &ctx->blocks;
+
+        while ((idx--) + 1)
+                lst = lst->next;
+
+        return (struct midgard_block *) lst;
+}
+
+/* Pretty printer for internal Midgard IR */
+
+static void
+print_mir_source(int source)
+{
+        if (source >= SSA_FIXED_MINIMUM) {
+                /* Specific register */
+                int reg = SSA_REG_FROM_FIXED(source);
+
+                /* TODO: Moving threshold */
+                if (reg > 16 && reg < 24)
+                        printf("u%d", 23 - reg);
+                else
+                        printf("r%d", reg);
+        } else {
+                printf("%d", source);
+        }
+}
+
+static void
+print_mir_instruction(midgard_instruction *ins)
+{
+        printf("\t");
+
+        switch (ins->type) {
+        case TAG_ALU_4: {
+                midgard_alu_op op = ins->alu.op;
+                const char *name = alu_opcode_names[op];
+
+                if (ins->unit)
+                        printf("%d.", ins->unit);
+
+                printf("%s", name ? name : "??");
+                break;
+        }
+
+        case TAG_LOAD_STORE_4: {
+                midgard_load_store_op op = ins->load_store.op;
+                const char *name = load_store_opcode_names[op];
+
+                assert(name);
+                printf("%s", name);
+                break;
+        }
+
+        case TAG_TEXTURE_4: {
+                printf("texture");
+                break;
+        }
+
+        default:
+                assert(0);
+        }
+
+        ssa_args *args = &ins->ssa_args;
+
+        printf(" %d, ", args->dest);
+
+        print_mir_source(args->src0);
+        printf(", ");
+
+        if (args->inline_constant)
+                printf("#%d", ins->inline_constant);
+        else
+                print_mir_source(args->src1);
+
+        if (ins->has_constants)
+                printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]);
+
+        printf("\n");
+}
+
+static void
+print_mir_block(midgard_block *block)
+{
+        printf("{\n");
+
+        mir_foreach_instr_in_block(block, ins) {
+                print_mir_instruction(ins);
+        }
+
+        printf("}\n");
+}
+
+
+
+static void
+attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name)
+{
+        ins->has_constants = true;
+        memcpy(&ins->constants, constants, 16);
+
+        /* If this is the special blend constant, mark this instruction */
+
+        if (ctx->is_blend && ctx->blend_constant_number == name)
+                ins->has_blend_constant = true;
+}
+
+static int
+glsl_type_size(const struct glsl_type *type)
+{
+        return glsl_count_attribute_slots(type, false);
+}
+
+/* Lower fdot2 to a vector multiplication followed by channel addition  */
+static void
+midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu)
+{
+        if (alu->op != nir_op_fdot2)
+                return;
+
+        b->cursor = nir_before_instr(&alu->instr);
+
+        nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0);
+        nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1);
+
+        nir_ssa_def *product = nir_fmul(b, src0, src1);
+
+        nir_ssa_def *sum = nir_fadd(b, 
+                        nir_channel(b, product, 0), 
+                        nir_channel(b, product, 1));
+
+        /* Replace the fdot2 with this sum */
+        nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum));
+}
+
+static bool
+midgard_nir_lower_fdot2(nir_shader *shader)
+{
+        bool progress = false;
+
+        nir_foreach_function(function, shader) {
+                if (!function->impl) continue;
+
+                nir_builder _b;
+                nir_builder *b = &_b;
+                nir_builder_init(b, function->impl);
+
+                nir_foreach_block(block, function->impl) {
+                        nir_foreach_instr_safe(instr, block) {
+                                if (instr->type != nir_instr_type_alu) continue;
+
+                                nir_alu_instr *alu = nir_instr_as_alu(instr);
+                                midgard_nir_lower_fdot2_body(b, alu);
+
+                                progress |= true;
+                        }
+                }
+
+                nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance);
+
+        }
+
+        return progress;
+}
+
+static void
+optimise_nir(nir_shader *nir)
+{
+        bool progress;
+
+        NIR_PASS(progress, nir, nir_lower_regs_to_ssa);
+        NIR_PASS(progress, nir, midgard_nir_lower_fdot2);
+
+        nir_lower_tex_options lower_tex_options = {
+                .lower_rect = true
+        };
+
+        NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options);
+
+        do {
+                progress = false;
+
+                NIR_PASS(progress, nir, midgard_nir_lower_algebraic);
+                NIR_PASS(progress, nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+                NIR_PASS(progress, nir, nir_lower_var_copies);
+                NIR_PASS(progress, nir, nir_lower_vars_to_ssa);
+
+                NIR_PASS(progress, nir, nir_copy_prop);
+                NIR_PASS(progress, nir, nir_opt_dce);
+                NIR_PASS(progress, nir, nir_opt_dead_cf);
+                NIR_PASS(progress, nir, nir_opt_cse);
+                NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true);
+                NIR_PASS(progress, nir, nir_opt_algebraic);
+                NIR_PASS(progress, nir, nir_opt_constant_folding);
+                NIR_PASS(progress, nir, nir_opt_undef);
+                NIR_PASS(progress, nir, nir_opt_loop_unroll,
+                         nir_var_shader_in |
+                         nir_var_shader_out |
+                         nir_var_function_temp);
+
+                /* TODO: Enable vectorize when merged upstream */
+                // NIR_PASS(progress, nir, nir_opt_vectorize);
+        } while (progress);
+
+        /* Must be run at the end to prevent creation of fsin/fcos ops */
+        NIR_PASS(progress, nir, midgard_nir_scale_trig);
+
+        do {
+                progress = false;
+
+                NIR_PASS(progress, nir, nir_opt_dce);
+                NIR_PASS(progress, nir, nir_opt_algebraic);
+                NIR_PASS(progress, nir, nir_opt_constant_folding);
+                NIR_PASS(progress, nir, nir_copy_prop);
+        } while (progress);
+
+        NIR_PASS(progress, nir, nir_opt_algebraic_late);
+
+        /* Lower mods */
+        NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_all_source_mods);
+        NIR_PASS(progress, nir, nir_copy_prop);
+        NIR_PASS(progress, nir, nir_opt_dce);
+
+        /* Take us out of SSA */
+        NIR_PASS(progress, nir, nir_lower_locals_to_regs);
+        NIR_PASS(progress, nir, nir_convert_from_ssa, true);
+
+        /* We are a vector architecture; write combine where possible */
+        NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest);
+        NIR_PASS(progress, nir, nir_lower_vec_to_movs);
+
+        NIR_PASS(progress, nir, nir_opt_dce);
+}
+
+/* Front-half of aliasing the SSA slots, merely by inserting the flag in the
+ * appropriate hash table. Intentional off-by-one to avoid confusing NULL with
+ * r0. See the comments in compiler_context */
+
+static void
+alias_ssa(compiler_context *ctx, int dest, int src)
+{
+        _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1));
+        _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1));
+}
+
+/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */
+
+static void
+unalias_ssa(compiler_context *ctx, int dest)
+{
+        _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1);
+        /* TODO: Remove from leftover or no? */
+}
+
+static void
+midgard_pin_output(compiler_context *ctx, int index, int reg)
+{
+        _mesa_hash_table_u64_insert(ctx->ssa_to_register, index + 1, (void *) ((uintptr_t) reg + 1));
+}
+
+static bool
+midgard_is_pinned(compiler_context *ctx, int index)
+{
+        return _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1) != NULL;
+}
+
+/* Do not actually emit a load; instead, cache the constant for inlining */
+
+static void
+emit_load_const(compiler_context *ctx, nir_load_const_instr *instr)
+{
+        nir_ssa_def def = instr->def;
+
+        float *v = ralloc_array(NULL, float, 4);
+        memcpy(v, &instr->value.f32, 4 * sizeof(float));
+        _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v);
+}
+
+/* Duplicate bits to convert sane 4-bit writemask to obscure 8-bit format (or
+ * do the inverse) */
+
+static unsigned
+expand_writemask(unsigned mask)
+{
+        unsigned o = 0;
+
+        for (int i = 0; i < 4; ++i)
+                if (mask & (1 << i))
+                        o |= (3 << (2 * i));
+
+        return o;
+}
+
+static unsigned
+squeeze_writemask(unsigned mask)
+{
+        unsigned o = 0;
+
+        for (int i = 0; i < 4; ++i)
+                if (mask & (3 << (2 * i)))
+                        o |= (1 << i);
+
+        return o;
+
+}
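+
+/* e.g. expand_writemask(0x5) (an .xz mask) yields 0x33, and
+ * squeeze_writemask(0x33) recovers 0x5 */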
+
+/* Determines effective writemask, taking quirks and expansion into account */
+static unsigned
+effective_writemask(midgard_vector_alu *alu)
+{
+        /* Channel count is off-by-one to fit in two bits (0 channels makes no
+         * sense) */
+
+        unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op]);
+
+        /* If there is a fixed channel count, construct the appropriate mask */
+
+        if (channel_count)
+                return (1 << channel_count) - 1;
+
+        /* Otherwise, just squeeze the existing mask */
+        return squeeze_writemask(alu->mask);
+}
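+
+/* e.g. for fdot3, quirked with OP_CHANNEL_COUNT(3), this returns
+ * (1 << 3) - 1 = 0x7 regardless of the mask encoded in the instruction */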
+
+static unsigned
+find_or_allocate_temp(compiler_context *ctx, unsigned hash)
+{
+        if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM))
+                return hash;
+
+        unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1);
+
+        if (temp)
+                return temp - 1;
+
+        /* If no temp is found, allocate one */
+        temp = ctx->temp_count++;
+        ctx->max_hash = MAX2(ctx->max_hash, hash);
+
+        _mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1));
+
+        return temp;
+}
+
+static unsigned
+nir_src_index(nir_src *src)
+{
+        if (src->is_ssa)
+                return src->ssa->index;
+        else
+                return 4096 + src->reg.reg->index;
+}
+
+static unsigned
+nir_dest_index(nir_dest *dst)
+{
+        if (dst->is_ssa)
+                return dst->ssa.index;
+        else
+                return 4096 + dst->reg.reg->index;
+}
+
+static unsigned
+nir_alu_src_index(nir_alu_src *src)
+{
+        return nir_src_index(&src->src);
+}
+
+/* Midgard puts conditionals in r31.w; move an arbitrary source (the output of
+ * a conditional test) into that register */
+
+static void
+emit_condition(compiler_context *ctx, nir_src *src, bool for_branch)
+{
+        /* XXX: Force component correct */
+        int condition = nir_src_index(src);
+
+        const midgard_vector_alu_src alu_src = {
+                .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X),
+        };
+
+        /* There is no boolean move instruction. Instead, we simulate a move by
+         * ANDing the condition with itself to get it into r31.w */
+
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .unit = for_branch ? UNIT_SMUL : UNIT_SADD, /* TODO: DEDUCE THIS */
+                .ssa_args = {
+                        .src0 = condition,
+                        .src1 = condition,
+                        .dest = SSA_FIXED_REGISTER(31),
+                },
+                .alu = {
+                        .op = midgard_alu_op_iand,
+                        .reg_mode = midgard_reg_mode_full,
+                        .dest_override = midgard_dest_override_none,
+                        .mask = (0x3 << 6), /* w */
+                        .src1 = vector_alu_srco_unsigned(alu_src),
+                        .src2 = vector_alu_srco_unsigned(alu_src)
+                },
+        };
+
+        emit_mir_instruction(ctx, ins);
+}
+
+/* Components: Number/style of arguments:
+ * 	3: One-argument op with r24 (i2f, f2i)
+ * 	2: Standard two argument op (fadd, fmul)
+ * 	1: Flipped one-argument op (fmov, imov)
+ * 	0: Standard one-argument op (frcp)
+ * NIR: NIR instruction op.
+ * Op: Midgard instruction op.
+ */
+
+#define ALU_CASE(_components, nir, _op) \
+	case nir_op_##nir: \
+		components = _components; \
+		op = midgard_alu_op_##_op; \
+		break;
+
+static void
+emit_alu(compiler_context *ctx, nir_alu_instr *instr)
+{
+        bool is_ssa = instr->dest.dest.is_ssa;
+
+        unsigned dest = nir_dest_index(&instr->dest.dest);
+        unsigned nr_components = is_ssa ? instr->dest.dest.ssa.num_components : instr->dest.dest.reg.reg->num_components;
+
+        /* Most Midgard ALU ops have a 1:1 correspondence to NIR ops; these
+         * are supported directly. A few do not and are commented out for now.
+         * Also, there are a number of NIR ops which Midgard does not support
+         * and which need to be lowered, also TODO. This switch block picks the
+         * opcode and calling convention of the Midgard instruction; the actual
+         * packing is done later in emit_alu */
+
+        unsigned op, components;
+
+        switch (instr->op) {
+                ALU_CASE(2, fadd, fadd);
+                ALU_CASE(2, fmul, fmul);
+                ALU_CASE(2, fmin, fmin);
+                ALU_CASE(2, fmax, fmax);
+                ALU_CASE(2, imin, imin);
+                ALU_CASE(2, imax, imax);
+                ALU_CASE(1, fmov, fmov);
+                ALU_CASE(0, ffloor, ffloor);
+                ALU_CASE(0, fceil, fceil);
+                ALU_CASE(2, fdot3, fdot3);
+                //ALU_CASE(2, fdot3r);
+                ALU_CASE(2, fdot4, fdot4);
+                //ALU_CASE(2, freduce);
+                ALU_CASE(2, iadd, iadd);
+                ALU_CASE(2, isub, isub);
+                ALU_CASE(2, imul, imul);
+
+                /* XXX: Use fmov, not imov, since imov was causing major
+                 * issues with texture precision? XXX research */
+                ALU_CASE(1, imov, fmov);
+
+                ALU_CASE(2, feq, feq);
+                ALU_CASE(2, fne, fne);
+                ALU_CASE(2, flt, flt);
+                ALU_CASE(2, ieq, ieq);
+                ALU_CASE(2, ine, ine);
+                ALU_CASE(2, ilt, ilt);
+                //ALU_CASE(2, icsel, icsel);
+                ALU_CASE(0, frcp, frcp);
+                ALU_CASE(0, frsq, frsqrt);
+                ALU_CASE(0, fsqrt, fsqrt);
+                ALU_CASE(0, fexp2, fexp2);
+                ALU_CASE(0, flog2, flog2);
+
+                ALU_CASE(3, f2i32, f2i);
+                ALU_CASE(3, f2u32, f2u);
+                ALU_CASE(3, i2f32, i2f);
+                ALU_CASE(3, u2f32, u2f);
+
+                ALU_CASE(0, fsin, fsin);
+                ALU_CASE(0, fcos, fcos);
+
+                ALU_CASE(2, iand, iand);
+                ALU_CASE(2, ior, ior);
+                ALU_CASE(2, ixor, ixor);
+                ALU_CASE(0, inot, inot);
+                ALU_CASE(2, ishl, ishl);
+                ALU_CASE(2, ishr, iasr);
+                ALU_CASE(2, ushr, ilsr);
+                //ALU_CASE(2, ilsr, ilsr);
+
+                ALU_CASE(2, ball_fequal4, fball_eq);
+                ALU_CASE(2, bany_fnequal4, fbany_neq);
+                ALU_CASE(2, ball_iequal4, iball_eq);
+                ALU_CASE(2, bany_inequal4, ibany_neq);
+
+        /* For greater-or-equal, we use less-or-equal and flip the
+         * arguments */
+
+        case nir_op_ige: {
+                components = 2;
+                op = midgard_alu_op_ile;
+
+                /* Swap via temporary */
+                nir_alu_src temp = instr->src[1];
+                instr->src[1] = instr->src[0];
+                instr->src[0] = temp;
+
+                break;
+        }
+
+        case nir_op_bcsel: {
+                components = 2;
+                op = midgard_alu_op_fcsel;
+
+                emit_condition(ctx, &instr->src[0].src, false);
+
+                /* The condition is the first argument; move the other
+                 * arguments up one to be a binary instruction for
+                 * Midgard */
+
+                memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src));
+                break;
+        }
+
+        /* We don't have a native b2f32 instruction. Instead, like many GPUs,
+         * we exploit booleans as 0/~0 for false/true, and correspondingly AND
+         * by 1.0 to do the type conversion. For the moment, prime us to emit:
+         *
+         * iand [whatever], #0
+         *
+         * At the end of emit_alu (as MIR), we'll fix-up the constant */
+
+        case nir_op_b2f32: {
+                op = midgard_alu_op_iand;
+                components = 0;
+                break;
+        }
+
+        default:
+                printf("Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
+                assert(0);
+                return;
+        }
+
+        int _unit = alu_opcode_props[op];
+
+        /* Initialise fields common between scalar/vector instructions */
+        midgard_outmod outmod = instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none;
+
+        /* src0 will always exist afaik, but src1 will not for 1-argument
+         * instructions. The latter can only be fetched if the instruction
+         * needs it, or else we may segfault. */
+
+        unsigned src0 = nir_alu_src_index(&instr->src[0]);
+        unsigned src1 = components == 2 ? nir_alu_src_index(&instr->src[1]) : SSA_UNUSED_0;
+
+        /* Rather than use the instruction generation helpers, we do it
+         * ourselves here to avoid the mess */
+
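+        /* Slot the NIR sources into src0/src1 per the calling convention
+         * encoded by 'components' (see the table above ALU_CASE) */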
+        midgard_instruction ins = {
+                .type = TAG_ALU_4,
+                .ssa_args = {
+                        .src0 = components == 3 || components == 2 || components == 0 ? src0 : SSA_UNUSED_1,
+                        .src1 = components == 2 ? src1 : components == 1 ? src0 : components == 0 ? SSA_UNUSED_0 : SSA_UNUSED_1,
+                        .dest = dest,
+                        .inline_constant = components == 0
+                }
+        };
+
+        nir_alu_src *nirmod0 = NULL;
+        nir_alu_src *nirmod1 = NULL;
+
+        if (components == 2) {
+                nirmod0 = &instr->src[0];
+                nirmod1 = &instr->src[1];
+        } else if (components == 1) {
+                nirmod1 = &instr->src[0];
+        } else if (components == 0) {
+                nirmod0 = &instr->src[0];
+        }
+
+        midgard_vector_alu alu = {
+                .op = op,
+                .reg_mode = midgard_reg_mode_full,
+                .dest_override = midgard_dest_override_none,
+                .outmod = outmod,
+
+                /* Writemask only valid for non-SSA NIR */
+                .mask = expand_writemask((1 << nr_components) - 1),
+
+                .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod0)),
+                .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod1)),
+        };
+
+        /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */
+
+        if (!is_ssa)
+                alu.mask &= expand_writemask(instr->dest.write_mask);
+
+        ins.alu = alu;
+
+        /* Late fixup for emulated instructions */
+
+        if (instr->op == nir_op_b2f32) {
+                /* Presently, our second argument is an inline #0 constant.
+                 * Switch over to an embedded 1.0 constant (that can't fit
+                 * inline, since we're 32-bit, not 16-bit like the inline
+                 * constants) */
+
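+                /* (A true boolean is ~0, so ANDing with the bit pattern of
+                 * 1.0f yields 1.0f; a false boolean is 0, yielding 0.0f.) */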
+                ins.ssa_args.inline_constant = false;
+                ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+                ins.has_constants = true;
+                ins.constants[0] = 1.0;
+        }
+
+        if (_unit == UNIT_VLUT) {
+                /* To avoid duplicating the LUTs (we think?), LUT instructions can only
+                 * operate as if they were scalars. Lower them here by changing the
+                 * component. */
+
+                assert(components == 0);
+
+                uint8_t original_swizzle[4];
+                memcpy(original_swizzle, nirmod0->swizzle, sizeof(nirmod0->swizzle));
+
+                for (int i = 0; i < nr_components; ++i) {
+                        ins.alu.mask = (0x3) << (2 * i); /* Mask the associated component */
+
+                        for (int j = 0; j < 4; ++j)
+                                nirmod0->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */
+
+                        ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmod0));
+                        emit_mir_instruction(ctx, ins);
+                }
+        } else {
+                emit_mir_instruction(ctx, ins);
+        }
+}
+
+static void
+emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr)
+{
+        nir_const_value *const_offset;
+        unsigned offset, reg;
+
+        switch (instr->intrinsic) {
+        case nir_intrinsic_discard_if:
+                emit_condition(ctx, &instr->src[0], true);
+
+        /* fallthrough */
+
+        case nir_intrinsic_discard: {
+                midgard_condition cond = instr->intrinsic == nir_intrinsic_discard_if ?
+                                         midgard_condition_true : midgard_condition_always;
+
+                EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_discard, 0, 2, cond);
+                ctx->can_discard = true;
+                break;
+        }
+
+        case nir_intrinsic_load_uniform:
+        case nir_intrinsic_load_input:
+                const_offset = nir_src_as_const_value(instr->src[0]);
+                assert (const_offset && "no indirect inputs");
+
+                offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+
+                reg = nir_dest_index(&instr->dest);
+
+                if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) {
+                        /* TODO: half-floats */
+
+                        int uniform_offset = 0;
+
+                        if (offset >= SPECIAL_UNIFORM_BASE) {
+                                /* XXX: Resolve which uniform */
+                                uniform_offset = 0;
+                        } else {
+                                /* Offset away from the special
+                                 * uniform block */
+
+                                void *entry = _mesa_hash_table_u64_search(ctx->uniform_nir_to_mdg, offset + 1);
+
+                                /* XXX */
+                                if (!entry) {
+                                        printf("WARNING: Unknown uniform %d\n", offset);
+                                        break;
+                                }
+
+                                uniform_offset = (uintptr_t) (entry) - 1;
+                                uniform_offset += ctx->special_uniforms;
+                        }
+
+                        if (uniform_offset < ctx->uniform_cutoff) {
+                                /* Fast path: for the first 16 uniforms,
+                                 * accesses are 0-cycle, since they're
+                                 * just a register fetch in the usual
+                                 * case. So, we alias the registers
+                                 * while we're still in SSA-space */
+
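+                                /* Uniform slot i lives in register r(23 - i), counting down */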
+                                int reg_slot = 23 - uniform_offset;
+                                alias_ssa(ctx, reg, SSA_FIXED_REGISTER(reg_slot));
+                        } else {
+                                /* Otherwise, read from the 'special'
+                                 * UBO to access higher-indexed
+                                 * uniforms, at a performance cost */
+
+                                midgard_instruction ins = m_load_uniform_32(reg, uniform_offset);
+
+                                /* TODO: Don't split */
+                                ins.load_store.varying_parameters = (uniform_offset & 7) << 7;
+                                ins.load_store.address = uniform_offset >> 3;
+
+                                ins.load_store.unknown = 0x1E00; /* xxx: what is this? */
+                                emit_mir_instruction(ctx, ins);
+                        }
+                } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) {
+                        /* XXX: Half-floats? */
+                        /* TODO: swizzle, mask */
+
+                        midgard_instruction ins = m_load_vary_32(reg, offset);
+
+                        midgard_varying_parameter p = {
+                                .is_varying = 1,
+                                .interpolation = midgard_interp_default,
+                                .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0
+                        };
+
+                        unsigned u;
+                        memcpy(&u, &p, sizeof(p));
+                        ins.load_store.varying_parameters = u;
+
+                        ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */
+                        emit_mir_instruction(ctx, ins);
+                } else if (ctx->is_blend && instr->intrinsic == nir_intrinsic_load_uniform) {
+                        /* Constant encoded as a pinned constant */
+
+                        midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg);
+                        ins.has_constants = true;
+                        ins.has_blend_constant = true;
+                        emit_mir_instruction(ctx, ins);
+                } else if (ctx->is_blend) {
+                        /* For blend shaders, a load might be
+                         * translated in various ways depending on what
+                         * we're loading. Figure out how this input is used */
+
+                        nir_variable *out = NULL;
+
+                        nir_foreach_variable(var, &ctx->nir->inputs) {
+                                int drvloc = var->data.driver_location;
+
+                                if (nir_intrinsic_base(instr) == drvloc) {
+                                        out = var;
+                                        break;
+                                }
+                        }
+
+                        assert(out);
+
+                        if (out->data.location == VARYING_SLOT_COL0) {
+                                /* Source color preloaded to r0 */
+
+                                midgard_pin_output(ctx, reg, 0);
+                        } else if (out->data.location == VARYING_SLOT_COL1) {
+                                /* Destination color must be read from framebuffer */
+
+                                midgard_instruction ins = m_load_color_buffer_8(reg, 0);
+                                ins.load_store.swizzle = 0; /* xxxx */
+
+                                /* Read each component sequentially */
+
+                                for (int c = 0; c < 4; ++c) {
+                                        ins.load_store.mask = (1 << c);
+                                        ins.load_store.unknown = c;
+                                        emit_mir_instruction(ctx, ins);
+                                }
+
+                                /* vadd.u2f hr2, abs(hr2), #0 */
+
+                                midgard_vector_alu_src alu_src = blank_alu_src;
+                                alu_src.abs = true;
+                                alu_src.half = true;
+
+                                midgard_instruction u2f = {
+                                        .type = TAG_ALU_4,
+                                        .ssa_args = {
+                                                .src0 = reg,
+                                                .src1 = SSA_UNUSED_0,
+                                                .dest = reg,
+                                                .inline_constant = true
+                                        },
+                                        .alu = {
+                                                .op = midgard_alu_op_u2f,
+                                                .reg_mode = midgard_reg_mode_half,
+                                                .dest_override = midgard_dest_override_none,
+                                                .mask = 0xF,
+                                                .src1 = vector_alu_srco_unsigned(alu_src),
+                                                .src2 = vector_alu_srco_unsigned(blank_alu_src),
+                                        }
+                                };
+
+                                emit_mir_instruction(ctx, u2f);
+
+                                /* vmul.fmul.sat r1, hr2, #0.00392151 */
+
+                                alu_src.abs = false;
+
+                                midgard_instruction fmul = {
+                                        .type = TAG_ALU_4,
+                                        .inline_constant = _mesa_float_to_half(1.0 / 255.0),
+                                        .ssa_args = {
+                                                .src0 = reg,
+                                                .dest = reg,
+                                                .src1 = SSA_UNUSED_0,
+                                                .inline_constant = true
+                                        },
+                                        .alu = {
+                                                .op = midgard_alu_op_fmul,
+                                                .reg_mode = midgard_reg_mode_full,
+                                                .dest_override = midgard_dest_override_none,
+                                                .outmod = midgard_outmod_sat,
+                                                .mask = 0xFF,
+                                                .src1 = vector_alu_srco_unsigned(alu_src),
+                                                .src2 = vector_alu_srco_unsigned(blank_alu_src),
+                                        }
+                                };
+
+                                emit_mir_instruction(ctx, fmul);
+                        } else {
+                                printf("Unknown input in blend shader\n");
+                                assert(0);
+                        }
+                } else if (ctx->stage == MESA_SHADER_VERTEX) {
+                        midgard_instruction ins = m_load_attr_32(reg, offset);
+                        ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */
+                        ins.load_store.mask = (1 << instr->num_components) - 1;
+                        emit_mir_instruction(ctx, ins);
+                } else {
+                        printf("Unknown load\n");
+                        assert(0);
+                }
+
+                break;
+
+        case nir_intrinsic_store_output:
+                const_offset = nir_src_as_const_value(instr->src[1]);
+                assert(const_offset && "no indirect outputs");
+
+                offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+
+                reg = nir_src_index(&instr->src[0]);
+
+                if (ctx->stage == MESA_SHADER_FRAGMENT) {
+                        /* gl_FragColor is not emitted with load/store
+                         * instructions. Instead, it gets plonked into
+                         * r0 at the end of the shader and we do the
+                         * framebuffer writeout dance. TODO: Defer
+                         * writes */
+
+                        midgard_pin_output(ctx, reg, 0);
+
+                        /* Save the index we're writing to for later reference
+                         * in the epilogue */
+
+                        ctx->fragment_output = reg;
+                } else if (ctx->stage == MESA_SHADER_VERTEX) {
+                        /* Varyings are written into one of two special
+                         * varying registers, r26 or r27. The particular
+                         * register is selected by the st_vary instruction,
+                         * as an offset from the base of 26: e.g. write into
+                         * r27 and then call st_vary(1).
+                         *
+                         * Normally, emitting fmov's is frowned upon, but due
+                         * to the unique constraints of REGISTER_VARYING,
+                         * fmov emission plus a dedicated cleanup pass is the
+                         * only way to guarantee correctness when considering
+                         * some (common) edge cases. XXX: FIXME */
+
+                        /* Look up how it was actually laid out */
+
+                        void *entry = _mesa_hash_table_u64_search(ctx->varying_nir_to_mdg, offset + 1);
+
+                        if (!entry) {
+                                printf("WARNING: skipping varying\n");
+                                break;
+                        }
+
+                        offset = (uintptr_t) (entry) - 1;
+
+                        /* If this varying corresponds to a constant (why?!),
+                         * emit that now since it won't get picked up by
+                         * hoisting (since there is no corresponding move
+                         * emitted otherwise) */
+
+                        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1);
+
+                        if (constant_value) {
+                                /* Special case: emit the varying write
+                                 * directly to r26 (looks funny in asm but it's
+                                 * fine) and emit the store _now_. Possibly
+                                 * slightly slower, but this is a really stupid
+                                 * special case anyway (why on earth would you
+                                 * have a constant varying? Your own fault for
+                                 * slightly worse perf :P) */
+
+                                midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26));
+                                attach_constants(ctx, &ins, constant_value, reg + 1);
+                                emit_mir_instruction(ctx, ins);
+
+                                midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(0), offset);
+                                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
+                                emit_mir_instruction(ctx, st);
+                        } else {
+                                /* Do not emit the varying yet -- instead, just mark down that we need to later */
+
+                                _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1)));
+                        }
+                } else {
+                        printf("Unknown store\n");
+                        assert(0);
+                }
+
+                break;
+
+        case nir_intrinsic_load_alpha_ref_float:
+                assert(instr->dest.is_ssa);
+
+                float ref_value = ctx->alpha_ref;
+
+                float *v = ralloc_array(NULL, float, 4);
+                memcpy(v, &ref_value, sizeof(float));
+                _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v);
+                break;
+
+
+        default:
+                printf ("Unhandled intrinsic\n");
+                assert(0);
+                break;
+        }
+}
+
+static unsigned
+midgard_tex_format(enum glsl_sampler_dim dim)
+{
+        switch (dim) {
+        case GLSL_SAMPLER_DIM_2D:
+        case GLSL_SAMPLER_DIM_EXTERNAL:
+                return TEXTURE_2D;
+
+        case GLSL_SAMPLER_DIM_3D:
+                return TEXTURE_3D;
+
+        case GLSL_SAMPLER_DIM_CUBE:
+                return TEXTURE_CUBE;
+
+        default:
+                printf("Unknown sampler dim type\n");
+                assert(0);
+                return 0;
+        }
+}
+
+static void
+emit_tex(compiler_context *ctx, nir_tex_instr *instr)
+{
+        /* TODO */
+        //assert (!instr->sampler);
+        //assert (!instr->texture_array_size);
+        assert (instr->op == nir_texop_tex);
+
+        /* Allocate registers via a round robin scheme to alternate between the two registers */
+        int reg = ctx->texture_op_count & 1;
+        int in_reg = reg, out_reg = reg;
+
+        /* Make room for the reg */
+
+        if (ctx->texture_index[reg] > -1)
+                unalias_ssa(ctx, ctx->texture_index[reg]);
+
+        int texture_index = instr->texture_index;
+        int sampler_index = texture_index;
+
+        for (unsigned i = 0; i < instr->num_srcs; ++i) {
+                switch (instr->src[i].src_type) {
+                case nir_tex_src_coord: {
+                        int index = nir_src_index(&instr->src[i].src);
+
+                        midgard_vector_alu_src alu_src = blank_alu_src;
+                        alu_src.swizzle = (COMPONENT_Y << 2);
+
+                        midgard_instruction ins = v_fmov(index, alu_src, SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg));
+                        emit_mir_instruction(ctx, ins);
+
+                        //midgard_pin_output(ctx, index, REGISTER_TEXTURE_BASE + in_reg);
+
+                        break;
+                }
+
+                default: {
+                        printf("Unknown source type\n");
+                        //assert(0);
+                        break;
+                }
+                }
+        }
+
+        /* No helper to build texture words -- we do it all here */
+        midgard_instruction ins = {
+                .type = TAG_TEXTURE_4,
+                .texture = {
+                        .op = TEXTURE_OP_NORMAL,
+                        .format = midgard_tex_format(instr->sampler_dim),
+                        .texture_handle = texture_index,
+                        .sampler_handle = sampler_index,
+
+                        /* TODO: Don't force xyzw */
+                        .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W),
+                        .mask = 0xF,
+
+                        /* TODO: half */
+                        //.in_reg_full = 1,
+                        .out_full = 1,
+
+                        .filter = 1,
+
+                        /* Always 1 */
+                        .unknown7 = 1,
+
+                        /* Assume we can continue; hint it out later */
+                        .cont = 1,
+                }
+        };
+
+        /* Set registers to read and write from the same place */
+        ins.texture.in_reg_select = in_reg;
+        ins.texture.out_reg_select = out_reg;
+
+        /* TODO: Dynamic swizzle input selection, half-swizzles? */
+        if (instr->sampler_dim == GLSL_SAMPLER_DIM_3D) {
+                ins.texture.in_reg_swizzle_right = COMPONENT_X;
+                ins.texture.in_reg_swizzle_left = COMPONENT_Y;
+                //ins.texture.in_reg_swizzle_third = COMPONENT_Z;
+        } else {
+                ins.texture.in_reg_swizzle_left = COMPONENT_X;
+                ins.texture.in_reg_swizzle_right = COMPONENT_Y;
+                //ins.texture.in_reg_swizzle_third = COMPONENT_X;
+        }
+
+        emit_mir_instruction(ctx, ins);
+
+        /* Simultaneously alias the destination and emit a move for it. The move will be eliminated if possible */
+
+        int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(&instr->dest);
+        alias_ssa(ctx, o_index, SSA_FIXED_REGISTER(o_reg));
+        ctx->texture_index[reg] = o_index;
+
+        midgard_instruction ins2 = v_fmov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index);
+        emit_mir_instruction(ctx, ins2);
+
+        /* Used for .cont and .last hinting */
+        ctx->texture_op_count++;
+}
+
+static void
+emit_jump(compiler_context *ctx, nir_jump_instr *instr)
+{
+        switch (instr->type) {
+                case nir_jump_break: {
+                        /* Emit a branch out of the loop */
+                        struct midgard_instruction br = v_branch(false, false);
+                        br.branch.target_type = TARGET_BREAK;
+                        br.branch.target_break = ctx->current_loop;
+                        emit_mir_instruction(ctx, br);
+
+                        printf("break..\n");
+                        break;
+                }
+
+                default:
+                        printf("Unknown jump type %d\n", instr->type);
+                        break;
+        }
+}
+
+static void
+emit_instr(compiler_context *ctx, struct nir_instr *instr)
+{
+        switch (instr->type) {
+        case nir_instr_type_load_const:
+                emit_load_const(ctx, nir_instr_as_load_const(instr));
+                break;
+
+        case nir_instr_type_intrinsic:
+                emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
+                break;
+
+        case nir_instr_type_alu:
+                emit_alu(ctx, nir_instr_as_alu(instr));
+                break;
+
+        case nir_instr_type_tex:
+                emit_tex(ctx, nir_instr_as_tex(instr));
+                break;
+
+        case nir_instr_type_jump:
+                emit_jump(ctx, nir_instr_as_jump(instr));
+                break;
+
+        case nir_instr_type_ssa_undef:
+                /* Spurious */
+                break;
+
+        default:
+                printf("Unhandled instruction type\n");
+                break;
+        }
+}
+
+/* Determine the actual hardware register from the index, based on the RA results or special values */
+
+static int
+dealias_register(compiler_context *ctx, struct ra_graph *g, int reg, int maxreg)
+{
+        if (reg >= SSA_FIXED_MINIMUM)
+                return SSA_REG_FROM_FIXED(reg);
+
+        if (reg >= 0) {
+                assert(reg < maxreg);
+                int r = ra_get_node_reg(g, reg);
+                ctx->work_registers = MAX2(ctx->work_registers, r);
+                return r;
+        }
+
+        switch (reg) {
+        /* fmov style unused */
+        case SSA_UNUSED_0:
+                return REGISTER_UNUSED;
+
+        /* lut style unused */
+        case SSA_UNUSED_1:
+                return REGISTER_UNUSED;
+
+        default:
+                printf("Unknown SSA register alias %d\n", reg);
+                assert(0);
+                return 31;
+        }
+}
+
+static unsigned int
+midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data)
+{
+        /* Choose the first available register to minimise reported register pressure */
+
+        for (int i = 0; i < 16; ++i) {
+                if (BITSET_TEST(regs, i)) {
+                        return i;
+                }
+        }
+
+        assert(0);
+        return 0;
+}
+
+static bool
+midgard_is_live_in_instr(midgard_instruction *ins, int src)
+{
+        if (ins->ssa_args.src0 == src) return true;
+        if (ins->ssa_args.src1 == src) return true;
+
+        return false;
+}
+
+static bool
+is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src)
+{
+        /* Check the rest of the block for liveness */
+        mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) {
+                if (midgard_is_live_in_instr(ins, src))
+                        return true;
+        }
+
+        /* Check the rest of the blocks for liveness */
+        mir_foreach_block_from(ctx, mir_next_block(block), b) {
+                mir_foreach_instr_in_block(b, ins) {
+                        if (midgard_is_live_in_instr(ins, src))
+                                return true;
+                }
+        }
+
+        /* TODO: How does control flow interact in complex shaders? */
+
+        return false;
+}
+
+static void
+allocate_registers(compiler_context *ctx)
+{
+        /* First, initialize the RA */
+        struct ra_regs *regs = ra_alloc_reg_set(NULL, 32, true);
+
+        /* Create a primary (general purpose) class, as well as special purpose
+         * pipeline register classes */
+
+        int primary_class = ra_alloc_reg_class(regs);
+        int varying_class  = ra_alloc_reg_class(regs);
+
+        /* Add the full set of work registers */
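+        /* (uniforms past the first eight spill down into the top work
+         * registers, so those registers are excluded from the allocatable set) */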
+        int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0);
+        for (int i = 0; i < work_count; ++i)
+                ra_class_add_reg(regs, primary_class, i);
+
+        /* Add special registers */
+        ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE);
+        ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE + 1);
+
+        /* We're done setting up */
+        ra_set_finalize(regs, NULL);
+
+        /* Transform the MIR into squeezed index form */
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0);
+                        ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1);
+                        ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest);
+                }
+
+                print_mir_block(block);
+        }
+
+        /* Let's actually do register allocation */
+        int nodes = ctx->temp_count;
+        struct ra_graph *g = ra_alloc_interference_graph(regs, nodes);
+
+        /* Set everything to the work register class, unless it has somewhere
+         * special to go */
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        if (ins->ssa_args.dest < 0) continue;
+
+                        if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
+
+                        int class = primary_class;
+
+                        ra_set_node_class(g, ins->ssa_args.dest, class);
+                }
+        }
+
+        for (int index = 0; index <= ctx->max_hash; ++index) {
+                unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1);
+
+                if (temp) {
+                        unsigned reg = temp - 1;
+                        int t = find_or_allocate_temp(ctx, index);
+                        ra_set_node_reg(g, t, reg);
+                }
+        }
+
+        /* Determine liveness */
+
+        int *live_start = malloc(nodes * sizeof(int));
+        int *live_end = malloc(nodes * sizeof(int));
+
+        /* Initialize as non-existent */
+
+        for (int i = 0; i < nodes; ++i) {
+                live_start[i] = live_end[i] = -1;
+        }
+
+        int d = 0;
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) {
+                                /* If this destination is not yet live, it is now since we just wrote it */
+
+                                int dest = ins->ssa_args.dest;
+
+                                if (live_start[dest] == -1)
+                                        live_start[dest] = d;
+                        }
+
+                        /* Since we just used a source, the source might be
+                         * dead now. Scan the rest of the block for
+                         * invocations, and if there are none, the source dies */
+
+                        int sources[2] = { ins->ssa_args.src0, ins->ssa_args.src1 };
+
+                        for (int src = 0; src < 2; ++src) {
+                                int s = sources[src];
+
+                                if (s < 0) continue;
+
+                                if (s >= SSA_FIXED_MINIMUM) continue;
+
+                                if (!is_live_after(ctx, block, ins, s)) {
+                                        live_end[s] = d;
+                                }
+                        }
+
+                        ++d;
+                }
+        }
+
+        /* If a node still hasn't been killed, kill it now */
+
+        for (int i = 0; i < nodes; ++i) {
+                /* live_start == -1 most likely indicates a pinned output */
+
+                if (live_end[i] == -1)
+                        live_end[i] = d;
+        }
+
+        /* Setup interference between nodes that are live at the same time */
+
+        for (int i = 0; i < nodes; ++i) {
+                for (int j = i + 1; j < nodes; ++j) {
+                        if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i]))
+                                ra_add_node_interference(g, i, j);
+                }
+        }
+
+        ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL);
+
+        if (!ra_allocate(g)) {
+                printf("Error allocating registers\n");
+                assert(0);
+        }
+
+        /* Cleanup */
+        free(live_start);
+        free(live_end);
+
+        mir_foreach_block(ctx, block) {
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->compact_branch) continue;
+
+                        ssa_args args = ins->ssa_args;
+
+                        switch (ins->type) {
+                        case TAG_ALU_4:
+                                ins->registers.src1_reg = dealias_register(ctx, g, args.src0, nodes);
+
+                                ins->registers.src2_imm = args.inline_constant;
+
+                                if (args.inline_constant) {
+                                        /* Encode inline 16-bit constant as a vector by default */
+
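+                                        /* The constant's high bits travel in the src2 register field;
+                                         * its low bits are repacked into the src2 modifier word below */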
+                                        ins->registers.src2_reg = ins->inline_constant >> 11;
+
+                                        int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+
+                                        uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3);
+                                        ins->alu.src2 = imm << 2;
+                                } else {
+                                        ins->registers.src2_reg = dealias_register(ctx, g, args.src1, nodes);
+                                }
+
+                                ins->registers.out_reg = dealias_register(ctx, g, args.dest, nodes);
+
+                                break;
+
+                        case TAG_LOAD_STORE_4: {
+                                if (OP_IS_STORE(ins->load_store.op)) {
+                                        /* TODO: use ssa_args for store_vary */
+                                        ins->load_store.reg = 0;
+                                } else {
+                                        bool has_dest = args.dest >= 0;
+                                        int ssa_arg = has_dest ? args.dest : args.src0;
+
+                                        ins->load_store.reg = dealias_register(ctx, g, ssa_arg, nodes);
+                                }
+
+                                break;
+                        }
+
+                        default:
+                                break;
+                        }
+                }
+        }
+}
+
+/* Midgard IR only knows vector ALU types, but we sometimes need to actually
+ * use scalar ALU instructions, for functional or performance reasons. To do
+ * this, we just demote vector ALU payloads to scalar. */
+
+static int
+component_from_mask(unsigned mask)
+{
+        for (int c = 0; c < 4; ++c) {
+                if (mask & (3 << (2 * c)))
+                        return c;
+        }
+
+        assert(0);
+        return 0;
+}
+
+static bool
+is_single_component_mask(unsigned mask)
+{
+        int components = 0;
+
+        for (int c = 0; c < 4; ++c)
+                if (mask & (3 << (2 * c)))
+                        components++;
+
+        return components == 1;
+}
+
+/* Create a mask of accessed components from a swizzle to figure out vector
+ * dependencies */
+
+static unsigned
+swizzle_to_access_mask(unsigned swizzle)
+{
+        unsigned component_mask = 0;
+
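+        /* e.g. a .yyyy swizzle (each 2-bit field == 1) accesses only y, giving mask 0b0010 */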
+        for (int i = 0; i < 4; ++i) {
+                unsigned c = (swizzle >> (2 * i)) & 3;
+                component_mask |= (1 << c);
+        }
+
+        return component_mask;
+}
+
+static unsigned
+vector_to_scalar_source(unsigned u)
+{
+        midgard_vector_alu_src v;
+        memcpy(&v, &u, sizeof(v));
+
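+        /* Only the swizzle's first (x-lane) selection is meaningful for a scalar source */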
+        midgard_scalar_alu_src s = {
+                .abs = v.abs,
+                .negate = v.negate,
+                .full = !v.half,
+                .component = (v.swizzle & 3) << 1
+        };
+
+        unsigned o;
+        memcpy(&o, &s, sizeof(s));
+
+        return o & ((1 << 6) - 1);
+}
+
+static midgard_scalar_alu
+vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins)
+{
+        /* The output component is from the mask */
+        midgard_scalar_alu s = {
+                .op = v.op,
+                .src1 = vector_to_scalar_source(v.src1),
+                .src2 = vector_to_scalar_source(v.src2),
+                .unknown = 0,
+                .outmod = v.outmod,
+                .output_full = 1, /* TODO: Half */
+                .output_component = component_from_mask(v.mask) << 1,
+        };
+
+        /* Inline constant is passed along rather than trying to extract it
+         * from v */
+
+        if (ins->ssa_args.inline_constant) {
+                uint16_t imm = 0;
+                int lower_11 = ins->inline_constant & ((1 << 12) - 1);
+                imm |= (lower_11 >> 9) & 3;
+                imm |= (lower_11 >> 6) & 4;
+                imm |= (lower_11 >> 2) & 0x38;
+                imm |= (lower_11 & 63) << 6;
+
+                s.src2 = imm;
+        }
+
+        return s;
+}
+
+/* Midgard prefetches instruction types, so during emission we need to
+ * look ahead too. Unless this is the last instruction, in which case we return
+ * 1. Or if this is the second to last and the last is an ALU, then it's also 1... */
+
+#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 ||  \
+		     tag == TAG_ALU_12 || tag == TAG_ALU_16)
+
+#define EMIT_AND_COUNT(type, val) util_dynarray_append(emission, type, val); \
+				  bytes_emitted += sizeof(type)
+
+static void
+emit_binary_vector_instruction(midgard_instruction *ains,
+                               uint16_t *register_words, int *register_words_count,
+                               uint64_t *body_words, size_t *body_size, int *body_words_count,
+                               size_t *bytes_emitted)
+{
+        memcpy(&register_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers));
+        *bytes_emitted += sizeof(midgard_reg_info);
+
+        body_size[*body_words_count] = sizeof(midgard_vector_alu);
+        memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu));
+        *bytes_emitted += sizeof(midgard_vector_alu);
+}
+
+/* Checks for an SSA data hazard between two adjacent instructions, keeping in
+ * mind that we are a vector architecture and we can write to different
+ * components simultaneously */
+
+static bool
+can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second)
+{
+        /* Each instruction reads some registers and writes to a register. See
+         * where the first writes */
+
+        /* Figure out where exactly we wrote to */
+        int source = first->ssa_args.dest;
+        int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF;
+
+        /* As long as the second doesn't read from the first, we're okay */
+        if (second->ssa_args.src0 == source) {
+                if (first->type == TAG_ALU_4) {
+                        /* Figure out which components we just read from */
+
+                        int q = second->alu.src1;
+                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+
+                        /* Check if there are components in common, and fail if so */
+                        if (swizzle_to_access_mask(m->swizzle) & source_mask)
+                                return false;
+                } else
+                        return false;
+
+        }
+
+        if (second->ssa_args.src1 == source)
+                return false;
+
+        /* Otherwise, it's safe in that regard. Another data hazard is both
+         * writing to the same place, of course */
+
+        if (second->ssa_args.dest == source) {
+                /* ...but only if the components overlap */
+                int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF;
+
+                if (dest_mask & source_mask)
+                        return false;
+        }
+
+        /* ...That's it */
+        return true;
+}
+
+/* Schedules, but does not emit, a single bundle. After scheduling, the final
+ * tag and size of the bundle are known, which are necessary for branching */
+
+static midgard_bundle
+schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip)
+{
+        int instructions_emitted = 0, instructions_consumed = -1;
+        midgard_bundle bundle = { 0 };
+
+        uint8_t tag = ins->type;
+
+        /* Default to the instruction's tag */
+        bundle.tag = tag;
+
+        switch (ins->type) {
+        case TAG_ALU_4: {
+                uint32_t control = 0;
+                size_t bytes_emitted = sizeof(control);
+
+                /* TODO: Constant combining */
+                int index = 0, last_unit = 0;
+
+                /* Previous instructions, for the purpose of parallelism */
+                midgard_instruction *segment[4] = {0};
+                int segment_size = 0;
+
+                instructions_emitted = -1;
+                midgard_instruction *pins = ins;
+
+                for (;;) {
+                        midgard_instruction *ains = pins;
+
+                        /* Advance instruction pointer */
+                        if (index) {
+                                ains = mir_next_op(pins);
+                                pins = ains;
+                        }
+
+                        /* Out-of-work condition */
+                        if ((struct list_head *) ains == &block->instructions)
+                                break;
+
+                        /* Ensure that the chain can continue */
+                        if (ains->type != TAG_ALU_4) break;
+
+                        /* According to the presentation "The ARM
+                         * Mali-T880 Mobile GPU" from HotChips 27,
+                         * there are two pipeline stages. Branching
+                         * position determined experimentally. Lines
+                         * are executed in parallel:
+                         *
+                         * [ VMUL ] [ SADD ]
+                         * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ]
+                         *
+                         * Verify that there are no ordering dependencies here.
+                         *
+                         * TODO: Allow for parallelism!!!
+                         */
+
+                        /* Pick a unit for it if it doesn't force a particular unit */
+
+                        int unit = ains->unit;
+
+                        if (!unit) {
+                                int op = ains->alu.op;
+                                int units = alu_opcode_props[op];
+
+                                /* TODO: Promotion of scalars to vectors */
+                                int vector = ((!is_single_component_mask(ains->alu.mask)) || ((units & UNITS_SCALAR) == 0)) && (units & UNITS_ANY_VECTOR);
+
+                                if (!vector)
+                                        assert(units & UNITS_SCALAR);
+
+                                if (vector) {
+                                        if (last_unit >= UNIT_VADD) {
+                                                if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        } else {
+                                                if ((units & UNIT_VMUL) && !(control & UNIT_VMUL))
+                                                        unit = UNIT_VMUL;
+                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+                                                        unit = UNIT_VADD;
+                                                else if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        }
+                                } else {
+                                        if (last_unit >= UNIT_VADD) {
+                                                if ((units & UNIT_SMUL) && !(control & UNIT_SMUL))
+                                                        unit = UNIT_SMUL;
+                                                else if (units & UNIT_VLUT)
+                                                        unit = UNIT_VLUT;
+                                                else
+                                                        break;
+                                        } else {
+                                                if ((units & UNIT_SADD) && !(control & UNIT_SADD))
+                                                        unit = UNIT_SADD;
+                                                else if (units & UNIT_SMUL)
+                                                        unit = UNIT_SMUL;
+                                                else if ((units & UNIT_VADD) && !(control & UNIT_VADD))
+                                                        unit = UNIT_VADD;
+                                                else
+                                                        break;
+                                        }
+                                }
+
+                                assert(unit & units);
+                        }
+
+                        /* Late unit check, this time for encoding (not parallelism) */
+                        if (unit <= last_unit) break;
+
+                        /* Clear the segment */
+                        if (last_unit < UNIT_VADD && unit >= UNIT_VADD)
+                                segment_size = 0;
+
+                        /* Check for data hazards */
+                        int has_hazard = false;
+
+                        for (int s = 0; s < segment_size; ++s)
+                                if (!can_run_concurrent_ssa(segment[s], ains))
+                                        has_hazard = true;
+
+                        if (has_hazard)
+                                break;
+
+                        /* We're good to go -- emit the instruction */
+                        ains->unit = unit;
+
+                        segment[segment_size++] = ains;
+
+                        /* Only one set of embedded constants per
+                         * bundle possible; if we have more, we must
+                         * break the chain early, unfortunately */
+
+                        if (ains->has_constants) {
+                                if (bundle.has_embedded_constants) {
+                                        /* ...but if there are already
+                                         * constants but these are the
+                                         * *same* constants, we let it
+                                         * through */
+
+                                        if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants)))
+                                                break;
+                                } else {
+                                        bundle.has_embedded_constants = true;
+                                        memcpy(bundle.constants, ains->constants, sizeof(bundle.constants));
+
+                                        /* If this is a blend shader special constant, track it for patching */
+                                        if (ains->has_blend_constant)
+                                                bundle.has_blend_constant = true;
+                                }
+                        }
+
+                        if (ains->unit & UNITS_ANY_VECTOR) {
+                                emit_binary_vector_instruction(ains, bundle.register_words,
+                                                               &bundle.register_words_count, bundle.body_words,
+                                                               bundle.body_size, &bundle.body_words_count, &bytes_emitted);
+                        } else if (ains->compact_branch) {
+                                /* All of r0 has to be written out
+                                 * along with the branch writeout.
+                                 * (slow!) */
+
+                                if (ains->writeout) {
+                                        if (index == 0) {
+                                                midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
+                                                ins.unit = UNIT_VMUL;
+
+                                                control |= ins.unit;
+
+                                                emit_binary_vector_instruction(&ins, bundle.register_words,
+                                                                               &bundle.register_words_count, bundle.body_words,
+                                                                               bundle.body_size, &bundle.body_words_count, &bytes_emitted);
+                                        } else {
+                                                /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies */
+                                                bool written_late = false;
+                                                bool components[4] = { 0 };
+                                                uint16_t register_dep_mask = 0;
+                                                uint16_t written_mask = 0;
+
+                                                midgard_instruction *qins = ins;
+                                                for (int t = 0; t < index; ++t) {
+                                                        if (qins->registers.out_reg != 0) {
+                                                                /* Mark down writes */
+
+                                                                written_mask |= (1 << qins->registers.out_reg);
+                                                        } else {
+                                                                /* Mark down the register dependencies for errata check */
+
+                                                                if (qins->registers.src1_reg < 16)
+                                                                        register_dep_mask |= (1 << qins->registers.src1_reg);
+
+                                                                if (qins->registers.src2_reg < 16)
+                                                                        register_dep_mask |= (1 << qins->registers.src2_reg);
+
+                                                                int mask = qins->alu.mask;
+
+                                                                for (int c = 0; c < 4; ++c)
+                                                                        if (mask & (0x3 << (2 * c)))
+                                                                                components[c] = true;
+
+                                                                /* ..but if the writeout is too late, we have to break up anyway... for some reason */
+
+                                                                if (qins->unit == UNIT_VLUT)
+                                                                        written_late = true;
+                                                        }
+
+                                                        /* Advance instruction pointer */
+                                                        qins = mir_next_op(qins);
+                                                }
+
+
+                                                /* ERRATA (?): In a bundle ending in a fragment writeout, the register dependencies of r0 cannot be written within this bundle (discovered in -bshading:shading=phong) */
+                                                if (register_dep_mask & written_mask) {
+                                                        printf("ERRATA WORKAROUND: Breakup for writeout dependency masks %X vs %X (common %X)\n", register_dep_mask, written_mask, register_dep_mask & written_mask);
+                                                        break;
+                                                }
+
+                                                if (written_late)
+                                                        break;
+
+                                                /* If even a single component is not written, break it up (conservative check). */
+                                                bool breakup = false;
+
+                                                for (int c = 0; c < 4; ++c)
+                                                        if (!components[c])
+                                                                breakup = true;
+
+                                                if (breakup)
+                                                        break;
+
+                                                /* Otherwise, we're free to proceed */
+                                        }
+                                }
+
+                                bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact);
+                                memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact));
+                                bytes_emitted += sizeof(ains->br_compact);
+                        } else {
+                                memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers));
+                                bytes_emitted += sizeof(midgard_reg_info);
+
+                                bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu);
+                                bundle.body_words_count++;
+                                bytes_emitted += sizeof(midgard_scalar_alu);
+                        }
+
+                        /* Defer marking until after writing to allow for break */
+                        control |= ains->unit;
+                        last_unit = ains->unit;
+                        ++instructions_emitted;
+                        ++index;
+                }
+
+                /* Bubble up the number of instructions for skipping */
+                instructions_consumed = index - 1;
+
+                int padding = 0;
+
+                /* Pad the ALU op out to the nearest quadword (16 bytes) */
+
+                if (bytes_emitted & 15) {
+                        padding = 16 - (bytes_emitted & 15);
+                        bytes_emitted += padding;
+                }
+
+                /* Constants must always be quadwords */
+                if (bundle.has_embedded_constants)
+                        bytes_emitted += 16;
+
+                /* Size ALU instruction for tag */
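+                /* The ALU tags are consecutive and encode size: TAG_ALU_4 is one quadword, TAG_ALU_8 two, and so on, hence the offset from TAG_ALU_4 */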
+                bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1;
+                bundle.padding = padding;
+                bundle.control = bundle.tag | control;
+
+                break;
+        }
+
+        case TAG_LOAD_STORE_4: {
+                /* A load/store bundle holds two instruction words at once. If
+                 * we only have one queued up, we need to NOP pad. Otherwise,
+                 * we pack both in succession to save space and cycles --
+                 * letting them run in parallel -- and skip the next
+                 * instruction. The usefulness of this optimisation depends
+                 * heavily on the quality of the instruction scheduler.
+                 */
+
+                midgard_instruction *next_op = mir_next_op(ins);
+
+                if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) {
+                        /* As the two operate concurrently, make sure
+                         * they are not dependent */
+
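+                        /* Note: the "|| true" below makes the pairing unconditional for now, effectively bypassing the dependency check */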
+                        if (can_run_concurrent_ssa(ins, next_op) || true) {
+                                /* Skip ahead, since it's redundant with the pair */
+                                instructions_consumed = 1 + (instructions_emitted++);
+                        }
+                }
+
+                break;
+        }
+
+#if 0
+
+        case TAG_TEXTURE_4:
+                /* TODO: Schedule texture ops */
+                break;
+#endif
+
+        default:
+                /* XXX: What happens with textures? */
+                break;
+        }
+
+        /* Copy the instructions into the bundle */
+        bundle.instruction_count = instructions_emitted + 1;
+
+        int used_idx = 0;
+
+        midgard_instruction *uins = ins;
+        for (int i = 0; used_idx < bundle.instruction_count; ++i) {
+                bundle.instructions[used_idx++] = *uins;
+                uins = mir_next_op(uins);
+        }
+
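+        /* Report how many extra source instructions were folded into this bundle so the caller can skip past them */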
+        *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed;
+
+        return bundle;
+}
+
+static int
+quadword_size(int tag)
+{
+        switch (tag) {
+        case TAG_ALU_4:
+                return 1;
+
+        case TAG_ALU_8:
+                return 2;
+
+        case TAG_ALU_12:
+                return 3;
+
+        case TAG_ALU_16:
+                return 4;
+
+        case TAG_LOAD_STORE_4:
+                return 1;
+
+        case TAG_TEXTURE_4:
+                return 1;
+
+        default:
+                assert(0);
+                return 0;
+        }
+}
+
+/* Schedule a single block by iterating over its instructions to create bundles.
+ * As we go, tally up the bundle sizes to compute the block size. */
+
+static void
+schedule_block(compiler_context *ctx, midgard_block *block)
+{
+        util_dynarray_init(&block->bundles, NULL);
+
+        block->quadword_count = 0;
+
+        mir_foreach_instr_in_block(block, ins) {
+                int skip;
+                midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip);
+                util_dynarray_append(&block->bundles, midgard_bundle, bundle);
+
+                if (bundle.has_blend_constant) {
+                        /* TODO: Multiblock? */
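+                        /* The embedded constants occupy the final quadword of this bundle; record its byte offset (16 bytes per quadword) so the blend colour can be patched in later */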
+                        int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1;
+                        ctx->blend_constant_offset = quadwords_within_block * 0x10;
+                }
+
+                while(skip--)
+                        ins = mir_next_op(ins);
+
+                block->quadword_count += quadword_size(bundle.tag);
+        }
+
+        block->is_scheduled = true;
+}
+
+static void
+schedule_program(compiler_context *ctx)
+{
+        allocate_registers(ctx);
+
+        mir_foreach_block(ctx, block) {
+                schedule_block(ctx, block);
+        }
+}
+
+/* After everything is scheduled, emit whole bundles at a time */
+
+static void
+emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag)
+{
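+        /* The next bundle's tag is shifted into bits 4-7 of this bundle's first control word, providing the hardware lookahead */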
+        int lookahead = next_tag << 4;
+
+        switch (bundle->tag) {
+        case TAG_ALU_4:
+        case TAG_ALU_8:
+        case TAG_ALU_12:
+        case TAG_ALU_16: {
+                /* Actually emit each component */
+                util_dynarray_append(emission, uint32_t, bundle->control | lookahead);
+
+                for (int i = 0; i < bundle->register_words_count; ++i)
+                        util_dynarray_append(emission, uint16_t, bundle->register_words[i]);
+
+                /* Emit body words based on the instructions bundled */
+                for (int i = 0; i < bundle->instruction_count; ++i) {
+                        midgard_instruction *ins = &bundle->instructions[i];
+
+                        if (ins->unit & UNITS_ANY_VECTOR) {
+                                memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu));
+                        } else if (ins->compact_branch) {
+                                /* Dummy move, XXX DRY */
+                                if ((i == 0) && ins->writeout) {
+                                        midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0));
+                                        memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu));
+                                }
+
+                                memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact));
+                        } else {
+                                /* Scalar */
+                                midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins);
+                                memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised));
+                        }
+                }
+
+                /* Emit padding (all zero) */
+                memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding);
+
+                /* Tack on constants */
+
+                if (bundle->has_embedded_constants) {
+                        util_dynarray_append(emission, float, bundle->constants[0]);
+                        util_dynarray_append(emission, float, bundle->constants[1]);
+                        util_dynarray_append(emission, float, bundle->constants[2]);
+                        util_dynarray_append(emission, float, bundle->constants[3]);
+                }
+
+                break;
+        }
+
+        case TAG_LOAD_STORE_4: {
+                /* One or two composing instructions */
+
+                uint64_t current64, next64 = LDST_NOP;
+
+                memcpy(&current64, &bundle->instructions[0].load_store, sizeof(current64));
+
+                if (bundle->instruction_count == 2)
+                        memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64));
+
+                midgard_load_store instruction = {
+                        .type = bundle->tag,
+                        .next_type = next_tag,
+                        .word1 = current64,
+                        .word2 = next64
+                };
+
+                util_dynarray_append(emission, midgard_load_store, instruction);
+
+                break;
+        }
+
+        case TAG_TEXTURE_4: {
+                /* Texture instructions are easy, since there is neither
+                 * pipelining nor VLIW to worry about. We may need to set the .cont/.last flags */
+
+                midgard_instruction *ins = &bundle->instructions[0];
+
+                ins->texture.type = TAG_TEXTURE_4;
+                ins->texture.next_type = next_tag;
+
+                ctx->texture_op_count--;
+
+                if (!ctx->texture_op_count) {
+                        ins->texture.cont = 0;
+                        ins->texture.last = 1;
+                }
+
+                util_dynarray_append(emission, midgard_texture_word, ins->texture);
+                break;
+        }
+
+        default:
+                printf("Unknown midgard instruction type\n");
+                assert(0);
+                break;
+        }
+}
+
+
+/* ALU instructions can inline or embed constants, which decreases register
+ * pressure and saves space. */
+
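+/* Throughout, hash table keys are stored off by one so that a lookup returning 0 / NULL unambiguously means "no entry" */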
+#define CONDITIONAL_ATTACH(src) { \
+	void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \
+\
+	if (entry) { \
+		attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \
+		alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \
+	} \
+}
+
+static void
+inline_alu_constants(compiler_context *ctx)
+{
+        mir_foreach_instr(ctx, alu) {
+                /* Other instructions cannot inline constants */
+                if (alu->type != TAG_ALU_4) continue;
+
+                /* If there is already a constant here, we can do nothing */
+                if (alu->has_constants) continue;
+
+                CONDITIONAL_ATTACH(src0);
+
+                if (!alu->has_constants) {
+                        CONDITIONAL_ATTACH(src1)
+                } else if (!alu->inline_constant) {
+                        /* Corner case: _two_ vec4 constants, for instance with a
+                         * csel. For this case, we can only use a constant
+                         * register for one, we'll have to emit a move for the
+                         * other. Note, if both arguments are constants, then
+                         * necessarily neither argument depends on the value of
+                         * any particular register. As the destination register
+                         * will be wiped, that means we can spill the constant
+                         * to the destination register.
+                         */
+
+                        void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1);
+                        unsigned scratch = alu->ssa_args.dest;
+
+                        if (entry) {
+                                midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch);
+                                attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1);
+
+                                /* Force a break XXX Defer r31 writes */
+                                ins.unit = UNIT_VLUT;
+
+                                /* Set the source */
+                                alu->ssa_args.src1 = scratch;
+
+                                /* Inject us -before- the last instruction which set r31 */
+                                mir_insert_instruction_before(mir_prev_op(alu), ins);
+                        }
+                }
+        }
+}
+
+/* Midgard supports two types of constants, embedded constants (128-bit) and
+ * inline constants (16-bit). Sometimes, especially with scalar ops, embedded
+ * constants can be demoted to inline constants, for space savings and
+ * sometimes a performance boost */
+
+static void
+embedded_to_inline_constant(compiler_context *ctx)
+{
+        mir_foreach_instr(ctx, ins) {
+                if (!ins->has_constants) continue;
+
+                if (ins->ssa_args.inline_constant) continue;
+
+                /* Blend constants must not be inlined by definition */
+                if (ins->has_blend_constant) continue;
+
+                /* src1 cannot be an inline constant due to encoding
+                 * restrictions. So, if possible we try to flip the arguments
+                 * in that case */
+
+                int op = ins->alu.op;
+
+                if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
+                        /* Flip based on op. Fallthrough intentional */
+
+                        switch (op) {
+                        /* These ops are not commutative, so flipping the arguments would require changing the operation itself -- TODO */
+                        case midgard_alu_op_flt:
+                        case midgard_alu_op_fle:
+                        case midgard_alu_op_ilt:
+                        case midgard_alu_op_ile:
+                        case midgard_alu_op_fcsel:
+                        case midgard_alu_op_icsel:
+                        case midgard_alu_op_isub:
+                                printf("Missed non-commutative flip (%s)\n", alu_opcode_names[op]);
+                                break;
+
+                        /* These ops are commutative and Just Flip */
+                        case midgard_alu_op_fne:
+                        case midgard_alu_op_fadd:
+                        case midgard_alu_op_fmul:
+                        case midgard_alu_op_fmin:
+                        case midgard_alu_op_fmax:
+                        case midgard_alu_op_iadd:
+                        case midgard_alu_op_imul:
+                        case midgard_alu_op_feq:
+                        case midgard_alu_op_ieq:
+                        case midgard_alu_op_ine:
+                        case midgard_alu_op_iand:
+                        case midgard_alu_op_ior:
+                        case midgard_alu_op_ixor:
+                                /* Flip the SSA numbers */
+                                ins->ssa_args.src0 = ins->ssa_args.src1;
+                                ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT);
+
+                                /* And flip the modifiers */
+
+                                unsigned src_temp;
+
+                                src_temp = ins->alu.src2;
+                                ins->alu.src2 = ins->alu.src1;
+                                ins->alu.src1 = src_temp;
+
+                        default:
+                                break;
+                        }
+                }
+
+                if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) {
+                        /* Extract the source information */
+
+                        midgard_vector_alu_src *src;
+                        int q = ins->alu.src2;
+                        midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q;
+                        src = m;
+
+                        /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */
+                        int component = src->swizzle & 3;
+
+                        /* Scale constant appropriately, if we can legally */
+                        uint16_t scaled_constant = 0;
+
+                        /* XXX: Check legality */
+                        if (midgard_is_integer_op(op)) {
+                                /* TODO: Inline integer */
+                                continue;
+
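+                                /* NB: the continue above makes this integer path unreachable until integer inlining is implemented */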
+                                unsigned int *iconstants = (unsigned int *) ins->constants;
+                                scaled_constant = (uint16_t) iconstants[component];
+
+                                /* Constant overflow after resize */
+                                if (scaled_constant != iconstants[component])
+                                        continue;
+                        } else {
+                                scaled_constant = _mesa_float_to_half((float) ins->constants[component]);
+                        }
+
+                        /* We don't know how to handle these with a constant */
+
+                        if (src->abs || src->negate || src->half || src->rep_low || src->rep_high) {
+                                printf("Bailing inline constant...\n");
+                                continue;
+                        }
+
+                        /* Make sure that the constant is not itself a
+                         * vector by checking if all accessed values
+                         * (by the swizzle) are the same. */
+
+                        uint32_t *cons = (uint32_t *) ins->constants;
+                        uint32_t value = cons[component];
+
+                        bool is_vector = false;
+                        unsigned mask = effective_writemask(&ins->alu);
+
+                        for (int c = 1; c < 4; ++c) {
+                                /* We only care if this component is actually used */
+                                if (!(mask & (1 << c)))
+                                        continue;
+
+                                uint32_t test = cons[(src->swizzle >> (2 * c)) & 3];
+
+                                if (test != value) {
+                                        is_vector = true;
+                                        break;
+                                }
+                        }
+
+                        if (is_vector)
+                                continue;
+
+                        /* Get rid of the embedded constant */
+                        ins->has_constants = false;
+                        ins->ssa_args.src1 = SSA_UNUSED_0;
+                        ins->ssa_args.inline_constant = true;
+                        ins->inline_constant = scaled_constant;
+                }
+        }
+}
+
+/* Map normal SSA sources to other SSA sources / fixed registers (like
+ * uniforms) */
+
+static void
+map_ssa_to_alias(compiler_context *ctx, int *ref)
+{
+        unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1);
+
+        if (alias) {
+                /* Remove the entry in leftovers to avoid a redundant fmov */
+
+                struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1)));
+
+                if (leftover)
+                        _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover);
+
+                /* Assign the alias map */
+                *ref = alias - 1;
+                return;
+        }
+}
+
+#define AS_SRC(to, u) \
+	int q##to = ins->alu.src2; \
+	midgard_vector_alu_src *to = (midgard_vector_alu_src *) &q##to;
+
+/* Removing unused moves is necessary to clean up the texture pipeline results.
+ *
+ * To do so, we find moves in the MIR and check whether their destination is live later; if it is not, the move is redundant and can be removed. */
+
+static void
+midgard_eliminate_orphan_moves(compiler_context *ctx, midgard_block *block)
+{
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (ins->type != TAG_ALU_4) continue;
+
+                if (ins->alu.op != midgard_alu_op_fmov) continue;
+
+                if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue;
+
+                if (midgard_is_pinned(ctx, ins->ssa_args.dest)) continue;
+
+                if (is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue;
+
+                mir_remove_instruction(ins);
+        }
+}
+
+/* The following passes reorder MIR instructions to enable better scheduling */
+
+static void
+midgard_pair_load_store(compiler_context *ctx, midgard_block *block)
+{
+        mir_foreach_instr_in_block_safe(block, ins) {
+                if (ins->type != TAG_LOAD_STORE_4) continue;
+
+                /* We've found a load/store op. Check if next is also load/store. */
+                midgard_instruction *next_op = mir_next_op(ins);
+                if (&next_op->link != &block->instructions) {
+                        if (next_op->type == TAG_LOAD_STORE_4) {
+                                /* If so, we're done since we're a pair */
+                                ins = mir_next_op(ins);
+                                continue;
+                        }
+
+                        /* Maximum search distance to pair, to avoid register pressure disasters */
+                        int search_distance = 8;
+
+                        /* Otherwise, we have an orphaned load/store -- search for another load */
+                        mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) {
+                                /* Terminate search if necessary */
+                                if (!(search_distance--)) break;
+
+                                if (c->type != TAG_LOAD_STORE_4) continue;
+
+                                if (OP_IS_STORE(c->load_store.op)) continue;
+
+                                /* We found one! Move it up to pair and remove it from the old location */
+
+                                mir_insert_instruction_before(ins, *c);
+                                mir_remove_instruction(c);
+
+                                break;
+                        }
+                }
+        }
+}
+
+/* Emit varying stores late */
+
+static void
+midgard_emit_store(compiler_context *ctx, midgard_block *block)
+{
+        /* Iterate in reverse to get the final write, rather than the first */
+
+        mir_foreach_instr_in_block_safe_rev(block, ins) {
+                /* Check if what we just wrote needs a store */
+                int idx = ins->ssa_args.dest;
+                uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1));
+
+                if (!varying) continue;
+
+                varying -= 1;
+
+                /* We need to store to the appropriate varying, so emit the
+                 * move/store */
+
+                /* TODO: Integrate with special purpose RA (and scheduler?) */
+                bool high_varying_register = false;
+
+                midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register));
+
+                midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying);
+                st.load_store.unknown = 0x1E9E; /* XXX: What is this? */
+
+                mir_insert_instruction_before(mir_next_op(ins), st);
+                mir_insert_instruction_before(mir_next_op(ins), mov);
+
+                /* We no longer need to store this varying */
+                _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1);
+        }
+}
+
+/* If there are leftovers after the aliasing pass below, emit actual fmov
+ * instructions for the slow-but-correct path */
+
+static void
+emit_leftover_move(compiler_context *ctx)
+{
+        set_foreach(ctx->leftover_ssa_to_alias, leftover) {
+                int base = ((uintptr_t) leftover->key) - 1;
+                int mapped = base;
+
+                map_ssa_to_alias(ctx, &mapped);
+                EMIT(fmov, mapped, blank_alu_src, base);
+        }
+}
+
+static void
+actualise_ssa_to_alias(compiler_context *ctx)
+{
+        mir_foreach_instr(ctx, ins) {
+                map_ssa_to_alias(ctx, &ins->ssa_args.src0);
+                map_ssa_to_alias(ctx, &ins->ssa_args.src1);
+        }
+
+        emit_leftover_move(ctx);
+}
+
+/* Vertex shaders do not write gl_Position as is; instead, they write a
+ * transformed screen space position as a varying. See section 12.5 "Coordinate
+ * Transformation" of the ES 3.2 full specification for details.
+ *
+ * This transformation occurs early on, in NIR and prior to optimisation, in
+ * order to take advantage of NIR optimisation passes on the transform itself.
+ */
+
+static void
+write_transformed_position(nir_builder *b, nir_src input_point_src, int uniform_no)
+{
+        nir_ssa_def *input_point = nir_ssa_for_src(b, input_point_src, 4);
+
+        /* Get viewport from the uniforms */
+        nir_intrinsic_instr *load;
+        load = nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_uniform);
+        load->num_components = 4;
+        load->src[0] = nir_src_for_ssa(nir_imm_int(b, uniform_no));
+        nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL);
+        nir_builder_instr_insert(b, &load->instr);
+
+        /* Formatted as <width, height, centerx, centery> */
+        nir_ssa_def *viewport_vec4 = &load->dest.ssa;
+        nir_ssa_def *viewport_width_2 = nir_channel(b, viewport_vec4, 0);
+        nir_ssa_def *viewport_height_2 = nir_channel(b, viewport_vec4, 1);
+        nir_ssa_def *viewport_offset = nir_channels(b, viewport_vec4, 0x8 | 0x4);
+
+        /* XXX: From uniforms? */
+        nir_ssa_def *depth_near = nir_imm_float(b, 0.0);
+        nir_ssa_def *depth_far = nir_imm_float(b, 1.0);
+
+        /* World space to normalised device coordinates */
+
+        nir_ssa_def *w_recip = nir_frcp(b, nir_channel(b, input_point, 3));
+        nir_ssa_def *ndc_point = nir_fmul(b, nir_channels(b, input_point, 0x7), w_recip);
+
+        /* Normalised device coordinates to screen space */
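+        /* i.e. screen.xy = ndc.xy * viewport_multiplier + viewport_offset, screen.z = ndc.z * (far - near)/2 + (far + near)/2 */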
+
+        nir_ssa_def *viewport_multiplier = nir_vec2(b, viewport_width_2, viewport_height_2);
+        nir_ssa_def *viewport_xy = nir_fadd(b, nir_fmul(b, nir_channels(b, ndc_point, 0x3), viewport_multiplier), viewport_offset);
+
+        nir_ssa_def *depth_multiplier = nir_fmul(b, nir_fsub(b, depth_far, depth_near), nir_imm_float(b, 0.5f));
+        nir_ssa_def *depth_offset     = nir_fmul(b, nir_fadd(b, depth_far, depth_near), nir_imm_float(b, 0.5f));
+        nir_ssa_def *screen_depth     = nir_fadd(b, nir_fmul(b, nir_channel(b, ndc_point, 2), depth_multiplier), depth_offset);
+
+        /* gl_Position will be written out in screenspace xyz, with w set to
+         * the reciprocal we computed earlier. The transformed w component is
+         * then used for perspective-correct varying interpolation */
+
+        nir_ssa_def *screen_space = nir_vec4(b,
+                                             nir_channel(b, viewport_xy, 0),
+                                             nir_channel(b, viewport_xy, 1),
+                                             screen_depth,
+                                             nir_fabs(b, w_recip));
+
+        /* Finally, write out the transformed values to the varying */
+
+        nir_intrinsic_instr *store;
+        store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+        store->num_components = 4;
+        nir_intrinsic_set_base(store, 0);
+        nir_intrinsic_set_write_mask(store, 0xf);
+        store->src[0].ssa = screen_space;
+        store->src[0].is_ssa = true;
+        store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+        nir_builder_instr_insert(b, &store->instr);
+}
+
+static void
+transform_position_writes(nir_shader *shader)
+{
+        nir_foreach_function(func, shader) {
+                nir_foreach_block(block, func->impl) {
+                        nir_foreach_instr_safe(instr, block) {
+                                if (instr->type != nir_instr_type_intrinsic) continue;
+
+                                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+                                nir_variable *out = NULL;
+
+                                switch (intr->intrinsic) {
+                                case nir_intrinsic_store_output:
+                                        /* already had i/o lowered.. lookup the matching output var: */
+                                        nir_foreach_variable(var, &shader->outputs) {
+                                                int drvloc = var->data.driver_location;
+
+                                                if (nir_intrinsic_base(intr) == drvloc) {
+                                                        out = var;
+                                                        break;
+                                                }
+                                        }
+
+                                        break;
+
+                                default:
+                                        break;
+                                }
+
+                                if (!out) continue;
+
+                                if (out->data.mode != nir_var_shader_out)
+                                        continue;
+
+                                if (out->data.location != VARYING_SLOT_POS)
+                                        continue;
+
+                                nir_builder b;
+                                nir_builder_init(&b, func->impl);
+                                b.cursor = nir_before_instr(instr);
+
+                                write_transformed_position(&b, intr->src[0], UNIFORM_VIEWPORT);
+                                nir_instr_remove(instr);
+                        }
+                }
+        }
+}
+
+static void
+emit_fragment_epilogue(compiler_context *ctx)
+{
+        /* Special case: writing out constants requires us to include the move
+         * explicitly now, so shove it into r0 */
+
+        void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1);
+
+        if (constant_value) {
+                midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0));
+                attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1);
+                emit_mir_instruction(ctx, ins);
+        }
+
+        /* Perform the actual fragment writeout. We have two writeout/branch
+         * instructions, forming a loop until writeout is successful as per the
+         * docs. TODO: gl_FragDepth */
+
+        EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
+        EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
+}
+
+/* For the blend epilogue, we need to convert the blended fragment vec4 (stored
+ * in r0) to an RGBA8888 value by scaling and type converting. We then output it
+ * with the int8 analogue of the fragment epilogue */
+
+static void
+emit_blend_epilogue(compiler_context *ctx)
+{
+        /* vmul.fmul.none.fulllow hr48, r0, #255 */
+
+        midgard_instruction scale = {
+                .type = TAG_ALU_4,
+                .unit = UNIT_VMUL,
+                .inline_constant = _mesa_float_to_half(255.0),
+                .ssa_args = {
+                        .src0 = SSA_FIXED_REGISTER(0),
+                        .src1 = SSA_UNUSED_0,
+                        .dest = SSA_FIXED_REGISTER(24),
+                        .inline_constant = true
+                },
+                .alu = {
+                        .op = midgard_alu_op_fmul,
+                        .reg_mode = midgard_reg_mode_full,
+                        .dest_override = midgard_dest_override_lower,
+                        .mask = 0xFF,
+                        .src1 = vector_alu_srco_unsigned(blank_alu_src),
+                        .src2 = vector_alu_srco_unsigned(blank_alu_src),
+                }
+        };
+
+        emit_mir_instruction(ctx, scale);
+
+        /* vadd.f2u8.pos.low hr0, hr48, #0 */
+
+        midgard_vector_alu_src alu_src = blank_alu_src;
+        alu_src.half = true;
+
+        midgard_instruction f2u8 = {
+                .type = TAG_ALU_4,
+                .ssa_args = {
+                        .src0 = SSA_FIXED_REGISTER(24),
+                        .src1 = SSA_UNUSED_0,
+                        .dest = SSA_FIXED_REGISTER(0),
+                        .inline_constant = true
+                },
+                .alu = {
+                        .op = midgard_alu_op_f2u8,
+                        .reg_mode = midgard_reg_mode_half,
+                        .dest_override = midgard_dest_override_lower,
+                        .outmod = midgard_outmod_pos,
+                        .mask = 0xF,
+                        .src1 = vector_alu_srco_unsigned(alu_src),
+                        .src2 = vector_alu_srco_unsigned(blank_alu_src),
+                }
+        };
+
+        emit_mir_instruction(ctx, f2u8);
+
+        /* vmul.imov.quarter r0, r0, r0 */
+
+        midgard_instruction imov_8 = {
+                .type = TAG_ALU_4,
+                .ssa_args = {
+                        .src0 = SSA_UNUSED_1,
+                        .src1 = SSA_FIXED_REGISTER(0),
+                        .dest = SSA_FIXED_REGISTER(0),
+                },
+                .alu = {
+                        .op = midgard_alu_op_imov,
+                        .reg_mode = midgard_reg_mode_quarter,
+                        .dest_override = midgard_dest_override_none,
+                        .mask = 0xFF,
+                        .src1 = vector_alu_srco_unsigned(blank_alu_src),
+                        .src2 = vector_alu_srco_unsigned(blank_alu_src),
+                }
+        };
+
+        /* Emit branch epilogue with the 8-bit move as the source */
+
+        emit_mir_instruction(ctx, imov_8);
+        EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always);
+
+        emit_mir_instruction(ctx, imov_8);
+        EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always);
+}
+
+static midgard_block *
+emit_block(compiler_context *ctx, nir_block *block)
+{
+        midgard_block *this_block = malloc(sizeof(midgard_block));
+        list_addtail(&this_block->link, &ctx->blocks);
+
+        this_block->is_scheduled = false;
+        ++ctx->block_count;
+
+        ctx->texture_index[0] = -1;
+        ctx->texture_index[1] = -1;
+
+        /* Set up current block */
+        list_inithead(&this_block->instructions);
+        ctx->current_block = this_block;
+
+        nir_foreach_instr(instr, block) {
+                emit_instr(ctx, instr);
+                ++ctx->instruction_count;
+        }
+
+        inline_alu_constants(ctx);
+        embedded_to_inline_constant(ctx);
+
+        /* Perform heavylifting for aliasing */
+        actualise_ssa_to_alias(ctx);
+
+        midgard_emit_store(ctx, this_block);
+        midgard_eliminate_orphan_moves(ctx, this_block);
+        midgard_pair_load_store(ctx, this_block);
+
+        /* Append fragment shader epilogue (value writeout) */
+        if (ctx->stage == MESA_SHADER_FRAGMENT) {
+                if (block == nir_impl_last_block(ctx->func->impl)) {
+                        if (ctx->is_blend)
+                                emit_blend_epilogue(ctx);
+                        else
+                                emit_fragment_epilogue(ctx);
+                }
+        }
+
+        /* Fallthrough save */
+        this_block->next_fallthrough = ctx->previous_source_block;
+
+        if (block == nir_start_block(ctx->func->impl))
+                ctx->initial_block = this_block;
+
+        if (block == nir_impl_last_block(ctx->func->impl))
+                ctx->final_block = this_block;
+
+        /* Allow the next control flow to access us retroactively, for
+         * branching etc */
+        ctx->current_block = this_block;
+
+        /* Document the fallthrough chain */
+        ctx->previous_source_block = this_block;
+
+        return this_block;
+}
+
+static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list);
+
+static void
+emit_if(struct compiler_context *ctx, nir_if *nif)
+{
+        /* Conditional branches expect the condition in r31.w; emit a move for
+         * that in the _previous_ block (which is the current block). */
+        emit_condition(ctx, &nif->condition, true);
+
+        /* Speculatively emit the branch, but we can't fill it in until later */
+        EMIT(branch, true, true);
+        midgard_instruction *then_branch = mir_last_in_block(ctx->current_block);
+
+        /* Emit the two subblocks */
+        midgard_block *then_block = emit_cf_list(ctx, &nif->then_list);
+
+        /* Emit a jump from the end of the then block to the end of the else */
+        EMIT(branch, false, false);
+        midgard_instruction *then_exit = mir_last_in_block(ctx->current_block);
+
+        /* Emit second block, and check if it's empty */
+
+        int else_idx = ctx->block_count;
+        int count_in = ctx->instruction_count;
+        midgard_block *else_block = emit_cf_list(ctx, &nif->else_list);
+
+        /* Now that we have the subblocks emitted, fix up the branches */
+
+        assert(then_block);
+        assert(else_block);
+
+
+        if (ctx->instruction_count == count_in) {
+                /* The else block is empty, so don't emit an exit jump */
+                mir_remove_instruction(then_exit);
+                then_branch->branch.target_block = else_idx + 1;
+        } else {
+                then_branch->branch.target_block = else_idx;
+                then_exit->branch.target_block = else_idx + 1;
+        }
+}
+
+static void
+emit_loop(struct compiler_context *ctx, nir_loop *nloop)
+{
+        /* Remember where we are */
+        midgard_block *start_block = ctx->current_block;
+
+        /* Allocate a loop number for this. TODO: Nested loops. Instead of a
+         * single current_loop variable, maybe we need a stack */
+
+        int loop_idx = ++ctx->current_loop;
+
+        /* Get index from before the body so we can loop back later */
+        int start_idx = ctx->block_count;
+
+        /* Emit the body itself */
+        emit_cf_list(ctx, &nloop->body);
+
+        /* Branch back to the top of the loop */
+        struct midgard_instruction br_back = v_branch(false, false);
+        br_back.branch.target_block = start_idx;
+        emit_mir_instruction(ctx, br_back);
+
+        /* Find the index of the block about to follow us (note: we don't add
+         * one; blocks are 0-indexed so we get a fencepost problem) */
+        int break_block_idx = ctx->block_count;
+
+        /* Fix up the break statements we emitted to point to the right place,
+         * now that we can allocate a block number for them */
+
+        list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) {
+                print_mir_block(block);
+                mir_foreach_instr_in_block(block, ins) {
+                        if (ins->type != TAG_ALU_4) continue;
+                        if (!ins->compact_branch) continue;
+                        if (ins->prepacked_branch) continue;
+
+                        /* We found a branch -- check the type to see if we need to do anything */
+                        if (ins->branch.target_type != TARGET_BREAK) continue;
+
+                        /* It's a break! Check if it's our break */
+                        if (ins->branch.target_break != loop_idx) continue;
+
+                        /* Okay, cool, we're breaking out of this loop.
+                         * Rewrite from a break to a goto */
+
+                        ins->branch.target_type = TARGET_GOTO;
+                        ins->branch.target_block = break_block_idx;
+                }
+        }
+}
+
+static midgard_block *
+emit_cf_list(struct compiler_context *ctx, struct exec_list *list)
+{
+        midgard_block *start_block = NULL;
+
+        foreach_list_typed(nir_cf_node, node, node, list) {
+                switch (node->type) {
+                case nir_cf_node_block: {
+                        midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node));
+
+                        if (!start_block)
+                                start_block = block;
+
+                        break;
+                }
+
+                case nir_cf_node_if:
+                        emit_if(ctx, nir_cf_node_as_if(node));
+                        break;
+
+                case nir_cf_node_loop:
+                        emit_loop(ctx, nir_cf_node_as_loop(node));
+                        break;
+
+                case nir_cf_node_function:
+                        assert(0);
+                        break;
+                }
+        }
+
+        return start_block;
+}
+
+int
+midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend)
+{
+        struct util_dynarray *compiled = &program->compiled;
+
+        compiler_context ictx = {
+                .nir = nir,
+                .stage = nir->info.stage,
+
+                .is_blend = is_blend,
+                .blend_constant_offset = -1,
+
+                .alpha_ref = program->alpha_ref
+        };
+
+        compiler_context *ctx = &ictx;
+
+        /* TODO: Decide this at runtime */
+        ctx->uniform_cutoff = 8;
+
+        switch (ctx->stage) {
+        case MESA_SHADER_VERTEX:
+                ctx->special_uniforms = 1;
+                break;
+
+        default:
+                ctx->special_uniforms = 0;
+                break;
+        }
+
+        /* Append epilogue uniforms if necessary. The cmdstream depends on
+         * these being at the -end-; see assign_var_locations. */
+
+        if (ctx->stage == MESA_SHADER_VERTEX) {
+                nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "viewport");
+        }
+
+        /* Assign var locations early, so the epilogue can use them if necessary */
+
+        nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size);
+        nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size);
+        nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size);
+
+        /* Initialize global (rather than per-block) hash tables */
+
+        ctx->ssa_constants = _mesa_hash_table_u64_create(NULL);
+        ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL);
+        ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL);
+        ctx->ssa_to_register = _mesa_hash_table_u64_create(NULL);
+        ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL);
+        ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
+
+        /* Assign actual uniform locations, skipping over samplers */
+
+        ctx->uniform_nir_to_mdg = _mesa_hash_table_u64_create(NULL);
+
+        nir_foreach_variable(var, &nir->uniforms) {
+                if (glsl_get_base_type(var->type) == GLSL_TYPE_SAMPLER) continue;
+
+                unsigned length = glsl_get_aoa_size(var->type);
+
+                if (!length) {
+                        length = glsl_get_length(var->type);
+                }
+
+                if (!length) {
+                        length = glsl_get_matrix_columns(var->type);
+                }
+
+                for (int col = 0; col < length; ++col) {
+                        int id = ctx->uniform_count++;
+                        _mesa_hash_table_u64_insert(ctx->uniform_nir_to_mdg, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
+                }
+        }
+
+        if (ctx->stage == MESA_SHADER_VERTEX) {
+                ctx->varying_nir_to_mdg = _mesa_hash_table_u64_create(NULL);
+
+                /* First, collect the special varyings */
+                nir_foreach_variable(var, &nir->outputs) {
+                        if (var->data.location == VARYING_SLOT_POS) {
+                                /* Set position first, always. It takes up two
+                                 * spots; the latter is de facto unused (at
+                                 * least from the shader's perspective), but we
+                                 * just need to skip over the spot */
+
+                                _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (0 + 1)));
+                                ctx->varying_count = MAX2(ctx->varying_count, 2);
+                        } else if (var->data.location == VARYING_SLOT_PSIZ) {
+                                /* Set point size second (third, see above) */
+                                _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + 1, (void *) ((uintptr_t) (2 + 1)));
+                                ctx->varying_count = MAX2(ctx->varying_count, 3);
+
+                                program->writes_point_size = true;
+                        }
+                }
+
+                /* Now, collect normal varyings */
+
+                nir_foreach_variable(var, &nir->outputs) {
+                        if (var->data.location == VARYING_SLOT_POS || var->data.location == VARYING_SLOT_PSIZ) continue;
+
+                        for (int col = 0; col < glsl_get_matrix_columns(var->type); ++col) {
+                                int id = ctx->varying_count++;
+                                _mesa_hash_table_u64_insert(ctx->varying_nir_to_mdg, var->data.driver_location + col + 1, (void *) ((uintptr_t) (id + 1)));
+                        }
+                }
+        }
+
+        /* Lower vars -- not I/O -- before epilogue */
+
+        NIR_PASS_V(nir, nir_lower_var_copies);
+        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+        NIR_PASS_V(nir, nir_split_var_copies);
+        NIR_PASS_V(nir, nir_lower_var_copies);
+        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
+        NIR_PASS_V(nir, nir_lower_var_copies);
+        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
+        NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);
+
+        /* Append vertex epilogue before optimisation, so the epilogue itself
+         * is optimised */
+
+        if (ctx->stage == MESA_SHADER_VERTEX)
+                transform_position_writes(nir);
+
+        /* Optimisation passes */
+
+        optimise_nir(nir);
+
+        nir_print_shader(nir, stdout);
+
+        /* Assign counts, now that we're sure (post-optimisation) */
+        program->uniform_count = nir->num_uniforms;
+
+        program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
+        program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);
+
+
+        nir_foreach_function(func, nir) {
+                if (!func->impl)
+                        continue;
+
+                list_inithead(&ctx->blocks);
+                ctx->block_count = 0;
+                ctx->func = func;
+
+                emit_cf_list(ctx, &func->impl->body);
+                emit_block(ctx, func->impl->end_block);
+
+                break; /* TODO: Multi-function shaders */
+        }
+
+        util_dynarray_init(compiled, NULL);
+
+        /* Schedule! */
+        schedule_program(ctx);
+
+        /* Now that all the bundles are scheduled and we can calculate block
+         * sizes, emit actual branch instructions rather than placeholders */
+
+        int br_block_idx = 0;
+
+        mir_foreach_block(ctx, block) {
+                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+                        for (int c = 0; c < bundle->instruction_count; ++c) {
+                                midgard_instruction *ins = &bundle->instructions[c];
+
+                                if (ins->unit != ALU_ENAB_BR_COMPACT) continue;
+
+                                if (ins->prepacked_branch) continue;
+
+                                uint16_t compact;
+
+                                /* Determine the block we're jumping to */
+                                int target_number = ins->branch.target_block;
+
+                                midgard_block *target = mir_get_block(ctx, target_number);
+                                assert(target);
+
+                                /* Determine the destination tag */
+                                midgard_bundle *first = util_dynarray_element(&target->bundles, midgard_bundle, 0);
+                                assert(first);
+
+                                int dest_tag = first->tag;
+
+                                /* Count up the number of quadwords we're jumping over. That is, the number of quadwords in each of the blocks between (br_block_idx, target_number) */
+                                int quadword_offset = 0;
+
+                                if (target_number > br_block_idx) {
+                                        /* Jump forward */
+
+                                        for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
+                                                midgard_block *blk = mir_get_block(ctx, idx);
+                                                assert(blk);
+
+                                                quadword_offset += blk->quadword_count;
+                                        }
+                                } else {
+                                        /* Jump backwards */
+
+                                        for (int idx = br_block_idx; idx >= target_number; --idx) {
+                                                midgard_block *blk = mir_get_block(ctx, idx);
+                                                assert(blk);
+
+                                                quadword_offset -= blk->quadword_count;
+                                        }
+                                }
+
+                                if (ins->branch.conditional) {
+                                        midgard_branch_cond branch = {
+                                                .op = midgard_jmp_writeout_op_branch_cond,
+                                                .dest_tag = dest_tag,
+                                                .offset = quadword_offset,
+                                                .cond = ins->branch.invert_conditional ? midgard_condition_false : midgard_condition_true
+                                        };
+
+                                        memcpy(&compact, &branch, sizeof(branch));
+                                } else {
+                                        midgard_branch_uncond branch = {
+                                                .op = midgard_jmp_writeout_op_branch_uncond,
+                                                .dest_tag = dest_tag,
+                                                .offset = quadword_offset,
+                                                .unknown = 1
+                                        };
+
+                                        memcpy(&compact, &branch, sizeof(branch));
+                                }
+
+                                /* Swap in the generic branch for our actual branch */
+                                ins->unit = ALU_ENAB_BR_COMPACT;
+                                ins->br_compact = compact;
+                        }
+
+                }
+
+                ++br_block_idx;
+        }
+
+        /* Emit flat binary from the instruction arrays. Iterate each block in
+         * sequence. Save instruction boundaries such that lookahead tags can
+         * be assigned easily */
+
+        /* Cache _all_ bundles in source order for lookahead across failed branches */
+
+        int bundle_count = 0;
+        mir_foreach_block(ctx, block) {
+                bundle_count += block->bundles.size / sizeof(midgard_bundle);
+        }
+        midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
+        int bundle_idx = 0;
+        mir_foreach_block(ctx, block) {
+                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+                        source_order_bundles[bundle_idx++] = bundle;
+                }
+        }
+
+        int current_bundle = 0;
+
+        mir_foreach_block(ctx, block) {
+                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
+                        int lookahead = 1;
+
+                        if (current_bundle + 1 < bundle_count) {
+                                uint8_t next = source_order_bundles[current_bundle + 1]->tag;
+
+                                if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
+                                        lookahead = 1;
+                                } else {
+                                        lookahead = next;
+                                }
+                        }
+
+                        emit_binary_bundle(ctx, bundle, compiled, lookahead);
+                        ++current_bundle;
+                }
+
+                /* TODO: Free deeper */
+                //util_dynarray_fini(&block->instructions);
+        }
+
+        free(source_order_bundles);
+
+        /* Due to lookahead, we need to report in the command stream the first
+         * tag executed. An initial block might be empty, so iterate until we
+         * find one that 'works' */
+
+        midgard_block *initial_block = list_first_entry(&ctx->blocks, midgard_block, link);
+
+        program->first_tag = 0;
+
+        do {
+                midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0);
+
+                if (initial_bundle) {
+                        program->first_tag = initial_bundle->tag;
+                        break;
+                }
+
+                /* Initial block is empty, try the next block */
+                initial_block = list_first_entry(&(initial_block->link), midgard_block, link);
+        } while(initial_block != NULL);
+
+        /* Make sure we actually set the tag */
+        assert(program->first_tag);
+
+        /* ctx->work_registers is the highest register index used, so add one
+         * to convert that index into a count (the usual fencepost) */
+        program->work_register_count = ctx->work_registers + 1;
+
+        program->can_discard = ctx->can_discard;
+        program->uniform_cutoff = ctx->uniform_cutoff;
+
+        program->blend_patch_offset = ctx->blend_constant_offset;
+
+        disassemble_midgard(program->compiled.data, program->compiled.size);
+
+        return 0;
+}
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_compile.h b/src/gallium/drivers/panfrost/midgard/midgard_compile.h
new file mode 100644
index 0000000000..887fd4e746
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard_compile.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2018 Alyssa Rosenzweig <alyssa at rosenzweig.io>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "compiler/nir/nir.h"
+#include "util/u_dynarray.h"
+
+/* Define the general compiler entry point */
+
+typedef struct {
+        int work_register_count;
+        int uniform_count;
+        int uniform_cutoff;
+
+        int attribute_count;
+        int varying_count;
+
+        /* Boolean properties of the program */
+        bool can_discard;
+        bool writes_point_size;
+
+        int first_tag;
+
+        struct util_dynarray compiled;
+
+        /* For a blend shader using a constant color: offset of the patch
+         * point for that constant. If negative, there is no constant to
+         * patch. */
+
+        int blend_patch_offset;
+
+        /* IN (set by the caller rather than the compiler): for a fragment
+         * shader with a lowered alpha test, the reference value */
+        float alpha_ref;
+} midgard_program;
+
+int
+midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend);
+
+/* NIR options are shared between the standalone compiler and the online
+ * compiler. Defining them here is the simplest, though maybe not the Right,
+ * solution. */
+
+static const nir_shader_compiler_options midgard_nir_options = {
+        .lower_ffma = true,
+        .lower_sub = true,
+        .lower_fpow = true,
+        .lower_scmp = true,
+        .lower_flrp32 = true,
+        .lower_flrp64 = true,
+        .lower_ffract = true,
+        .lower_fmod32 = true,
+        .lower_fmod64 = true,
+        .lower_fdiv = true,
+        .lower_idiv = true,
+
+        .vertex_id_zero_based = true,
+        .lower_extract_byte = true,
+        .lower_extract_word = true,
+
+        .native_integers = true
+};
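+
+/* A rough usage sketch for this entry point; the construction of 'nir' and
+ * the upload_shader() helper below are hypothetical, shown only for shape:
+ *
+ *     midgard_program program = { 0 };
+ *
+ *     // 'nir' is a nir_shader built/lowered against midgard_nir_options
+ *     midgard_compile_shader_nir(nir, &program, false);
+ *
+ *     // program.compiled holds the packed binary; program.first_tag is
+ *     // what the command stream reports as the first tag executed
+ *     upload_shader(program.compiled.data, program.compiled.size);
+ */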
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_nir.h b/src/gallium/drivers/panfrost/midgard/midgard_nir.h
new file mode 100644
index 0000000000..b7a2298050
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard_nir.h
@@ -0,0 +1,5 @@
+#include <stdbool.h>
+#include "nir.h"
+
+bool midgard_nir_lower_algebraic(nir_shader *shader);
+bool midgard_nir_scale_trig(nir_shader *shader);
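+
+/* Rough sketch of the intended invocation; the surrounding optimization loop
+ * is illustrative, not part of this interface:
+ *
+ *     bool progress;
+ *     do {
+ *             progress = false;
+ *             NIR_PASS(progress, shader, midgard_nir_lower_algebraic);
+ *             // ... other NIR optimizations ...
+ *     } while (progress);
+ *
+ *     // scale_trig must run exactly once, after the main loop
+ *     NIR_PASS_V(shader, midgard_nir_scale_trig);
+ */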
diff --git a/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py b/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py
new file mode 100644
index 0000000000..44441727b7
--- /dev/null
+++ b/src/gallium/drivers/panfrost/midgard/midgard_nir_algebraic.py
@@ -0,0 +1,71 @@
+#
+# Copyright (C) 2018 Alyssa Rosenzweig
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import argparse
+import sys
+import math
+
+a = 'a'
+b = 'b'
+
+algebraic = [
+    (('b2i32', a), ('iand@32', "a@32", 1)),
+    (('isign', a), ('imin', ('imax', a, -1), 1)),
+    (('fge', a, b), ('flt', b, a)),
+
+    # XXX: We have hw ops for this, just unknown atm..
+    #(('fsign@32', a), ('i2f32@32', ('isign', ('f2i32@32', ('fmul', a, 0x43800000)))))
+    #(('fsign', a), ('fcsel', ('fge', a, 0), 1.0, ('fcsel', ('flt', a, 0.0), -1.0, 0.0)))
+    (('fsign', a), ('bcsel', ('fge', a, 0), 1.0, -1.0)),
+]
+
+# Midgard's fsin/fcos scale their argument by pi in hardware, so divide it
+# back out here. This pass must be run exactly once, after the main
+# optimization loop, or the division would be applied repeatedly.
+
+scale_trig = [
+        (('fsin', a), ('fsin', ('fdiv', a, math.pi))),
+        (('fcos', a), ('fcos', ('fdiv', a, math.pi))),
+]
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-p', '--import-path', required=True)
+    args = parser.parse_args()
+    sys.path.insert(0, args.import_path)
+    run()
+
+
+def run():
+    import nir_algebraic  # pylint: disable=import-error
+
+    print('#include "midgard_nir.h"')
+    print(nir_algebraic.AlgebraicPass("midgard_nir_lower_algebraic",
+                                      algebraic).render())
+
+    print(nir_algebraic.AlgebraicPass("midgard_nir_scale_trig",
+                                      scale_trig).render())
+
+
+if __name__ == '__main__':
+    main()
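+
+# For reference, running this by hand looks roughly like (the path is
+# illustrative; it must point at the directory containing nir_algebraic.py):
+#
+#     python midgard_nir_algebraic.py -p src/compiler/nir/ > midgard_nir_algebraic.c
+#
+# The generated midgard_nir_lower_algebraic and midgard_nir_scale_trig passes
+# are printed to stdout.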
-- 
2.20.1


