Mesa (main): pan/bi: Annotate Valhall instructions with units

Thu Nov 18 23:34:31 UTC 2021

Module: Mesa
Branch: main
Commit: 855ab23d9af3e3e17b68985b3c8c782f3153712b
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=855ab23d9af3e3e17b68985b3c8c782f3153712b

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Mon Nov 15 18:18:23 2021 -0500

pan/bi: Annotate Valhall instructions with units

Based on analyzing the cycle counts reported by the Mali offline
compiler.

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13802>

---

 src/panfrost/bifrost/valhall/ISA.xml | 188 ++++++++++++++++++-----------------
 1 file changed, 97 insertions(+), 91 deletions(-)

diff --git a/src/panfrost/bifrost/valhall/ISA.xml b/src/panfrost/bifrost/valhall/ISA.xml
index 99fbbdcf9f2..3dcd81fb72b 100644
--- a/src/panfrost/bifrost/valhall/ISA.xml
+++ b/src/panfrost/bifrost/valhall/ISA.xml
@@ -576,7 +576,7 @@
     <value name="0x7C007C00">v2inf</value>
   </enum>
 
-  <ins name="NOP" title="No operation" dests="0" opcode="0x00">
+  <ins name="NOP" title="No operation" dests="0" opcode="0x00" unit="CVT">
     <desc>
       Do nothing. Useful at the start of a block for waiting on slots required
       by the first actual instruction of the block, to reconcile dependencies
@@ -584,7 +584,7 @@
     </desc>
   </ins>
 
-  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F">
+  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F" unit="CVT">
     <desc>
       Branches to a specified relative offset if its source is nonzero (default)
       or if its source is zero (if `.eq` is set). The offset is 27-bits and
@@ -605,7 +605,7 @@
     <mod name="eq" start="36" size="1"/>
   </ins>
 
-  <ins name="DISCARD.f32" title="Discard fragment" opcode="0x20">
+  <ins name="DISCARD.f32" title="Discard fragment" opcode="0x20" unit="CVT">
     <desc>
       Evaluates the given condition, and if it passes, discards the current
       fragment and terminates the thread. The destination should be set to R60.
@@ -617,7 +617,7 @@
     <src absneg="true" swizzle="true">Right value to compare</src>
   </ins>
 
-  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F">
+  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F" unit="CVT">
     <desc>
       Jump to an indirectly specified address. Used to jump to blend shaders at
       the end of a fragment shader.
@@ -627,7 +627,7 @@
     <mod name="eq" start="36" size="1"/>
   </ins>
 
-  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45">
+  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unit="NONE">
     <desc>
       General-purpose barrier. Must use slot #7. Must be paired with a
       `.barrier` action on the instruction.
@@ -635,7 +635,7 @@
     <slot/>
   </ins>
 
-  <group name="CSEL" title="Floating-point conditional select" dests="1">
+  <group name="CSEL" title="Floating-point conditional select" dests="1" unit="CVT">
     <ins name="CSEL.f32" opcode="0x154"/>
     <ins name="CSEL.v2f16" opcode="0x155"/>
     <desc>
@@ -649,7 +649,7 @@
     <src float="true">Return value if false</src>
   </group>
 
-  <group name="CSEL" title="Integer conditional select" dests="1">
+  <group name="CSEL" title="Integer conditional select" dests="1" unit="CVT">
     <ins name="CSEL.u32" opcode="0x150"/>
     <ins name="CSEL.v2u16" opcode="0x151"/>
     <ins name="CSEL.i32" opcode="0x158"/>
@@ -670,7 +670,7 @@
     <src>Return value if false</src>
   </group>
 
-  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56">
+  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56" unit="V">
     <sr write="true"/>
     <sr_count/>
     <vecsize/>
@@ -680,7 +680,7 @@
     <imm name="index" start="12" size="4"/> <!-- 0 for pointx, 1 for pointy, 2 for fragw, 3 for fragz -->
   </ins>
 
-  <group name="LD_VAR_IMM_F32" title="Load immediate varying">
+  <group name="LD_VAR_IMM_F32" title="Load immediate varying" unit="V">
     <desc>Interpolates a given varying</desc>
     <ins name="LD_VAR_IMM_F32" opcode="0x5C"/>
     <ins name="LD_VAR_IMM_F16" opcode="0x5D"/>
@@ -694,7 +694,7 @@
     <imm name="index" start="20" size="4"/>
   </group>
 
-  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66">
+  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66" unit="LS">
     <sr_count/>
     <vecsize/>
     <regfmt/>
@@ -705,7 +705,7 @@
     <imm name="index" start="20" size="4"/>
   </ins>
 
-  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x67">
+  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x67" unit="LS">
     <desc>The index must not diverge within a warp.</desc>
     <vecsize/>
     <regfmt/>
@@ -717,7 +717,7 @@
     <src>Index</src>
   </ins>
 
-  <ins name="LEA_ATTR" title="Load effective address" opcode="0x5E">
+  <ins name="LEA_ATTR" title="Load effective address" opcode="0x5E" unit="LS">
     <desc>
       Loads the effective address of the position buffer (in a position shader)
       or the varying buffer (in a varying shader). That is, the base pointer
@@ -736,7 +736,7 @@
     <src>Linear ID</src>
   </ins>
 
-  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0">
+  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -747,7 +747,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1">
+  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -758,7 +758,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2">
+  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -769,7 +769,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3">
+  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -780,7 +780,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4">
+  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -791,7 +791,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5">
+  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -802,7 +802,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6">
+  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -813,7 +813,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7">
+  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7" unit="LS">
     <desc>Loads from main memory</desc>
     <sr write="true"/>
     <sr_count/>
@@ -824,7 +824,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </ins>
 
-  <group name="STORE" title="Global memory store" opcode="0x61">
+  <group name="STORE" title="Global memory store" opcode="0x61" unit="LS">
     <desc>Stores to main memory</desc>
     <sr read="true"/>
     <ins name="STORE.i8" opcode2="0x0"/>
@@ -842,7 +842,7 @@
     <imm name="offset" start="8" size="16" signed="true"/>
   </group>
 
-  <ins name="ST_IMAGE" title="Image store" opcode="0x71">
+  <ins name="ST_IMAGE" title="Image store" opcode="0x71" unit="LS">
     <desc>Stores to images</desc>
     <sr read="true"/>
     <sr_count/>
@@ -850,7 +850,7 @@
     <src>Address to store to after adding offset</src>
   </ins>
 
-  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78">
+  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78" unit="NONE">
     <desc>
       Loads a given render target, specified in the pixel indices descriptor, at
       a given location and sample, and convert to the format specified in the
@@ -865,7 +865,7 @@
     <src>Conversion descriptor</src>
   </ins>
 
-  <ins name="BLEND" title="Blend render target" opcode="0x7F">
+  <ins name="BLEND" title="Blend render target" opcode="0x7F" unit="NONE">
     <desc>
       Blends a given render target. This loads the API-specified blend state for
       the render target from the first source. Blend descriptors are available
@@ -901,7 +901,7 @@
     <regfmt/>
   </ins>
 
-  <ins name="ATEST" title="Alpha test" opcode="0x7D">
+  <ins name="ATEST" title="Alpha test" opcode="0x7D" unit="NONE">
     <desc>
       Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
       does not do an implicit discard. It should be executed before the first
@@ -914,7 +914,7 @@
     <sr_count/>
   </ins>
 
-  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E">
+  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E" unit="NONE">
     <desc>
       Programmatically writes out depth, stencil, or both, depending on which
       modifiers are set. Used to implement gl_FragDepth and gl_FragStencil.
@@ -927,7 +927,7 @@
     <src>Input coverage mask</src>
   </ins>
 
-  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90" unit="CVT">
     <desc>
       Performs the given data conversion. Note that floating-point rounding is
       handled via the same hardware and therefore shares an encoding. Round mode
@@ -950,7 +950,7 @@
     <src widen="true">Value to convert</src>
   </group>
 
-  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unit="CVT">
     <desc>Performs the given data conversion.</desc>
     <ins name="F32_TO_S32" opcode2="0xC"/>
     <ins name="F32_TO_U32" opcode2="0x1C"/>
@@ -958,7 +958,7 @@
     <src absneg="true">Value to convert</src>
   </group>
 
-  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90" unit="CVT">
     <desc>Performs the given data conversion.</desc>
     <ins name="V2F16_TO_V2S16" opcode2="0xE"/>
     <ins name="V2F16_TO_V2U16" opcode2="0x1E"/>
@@ -968,13 +968,13 @@
     <src swizzle="true" absneg="true" size="16">Value to convert</src>
   </group>
 
-  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB">
+  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB" unit="CVT">
     <desc>Converts up with the specified round mode.</desc>
     <roundmode/>
     <src lane="28" size="16" absneg="true">Value to convert</src>
   </ins>
 
-  <group name="CONVERT" title="8-bit data conversions" dests="1" opcode="0x90">
+  <group name="CONVERT" title="8-bit data conversions" dests="1" opcode="0x90" unit="CVT">
     <desc>
       Performs the given data conversion.
     </desc>
@@ -992,7 +992,7 @@
     <src lane="28" size="8">Value to convert</src>
   </group>
 
-  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90">
+  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90" unit="CVT">
     <desc>
       Performs the given rounding, using the convert unit.
     </desc>
@@ -1004,33 +1004,33 @@
     <src swizzle="true" absneg="true">Value to convert</src>
   </group>
 
-  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0">
+  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0" unit="CVT">
     <desc>Canonical register-to-register move.</desc>
     <src/>
   </ins>
 
-  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4">
+  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4" unit="CVT">
     <desc>
       Used as a primitive for various bitwise operations.
     </desc>
     <src/>
   </ins>
 
-  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5">
+  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5" unit="CVT">
     <desc>
       Used as a primitive for various bitwise operations.
     </desc>
     <src/>
   </ins>
 
-  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6">
+  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6" unit="CVT">
     <desc>
       Used as a primitive for various bitwise operations.
     </desc>
     <src/>
   </ins>
 
-  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8">
+  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8" unit="CVT">
     <desc>
       64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
       sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
@@ -1039,15 +1039,15 @@
     <src widen="true"/>
   </ins>
 
-  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9">
+  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9" unit="CVT">
     <src widen="true"/>
   </ins>
 
-  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa">
+  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa" unit="CVT">
     <src/>
   </ins>
 
-  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC">
+  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC" unit="SFU">
     <desc>
       Only available as 32-bit. Smaller bitsizes require explicit conversions.
       64-bit popcount may be constructed in 3 clocks by separate 32-bit
@@ -1057,28 +1057,29 @@
     <src/>
   </ins>
 
-  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD">
+  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD" unit="SFU">
     <desc>
       Only available as 32-bit. Other bitsizes may be derived with swizzles.
     </desc>
     <src/>
   </ins>
 
-  <ins name="NOT.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE">
+  <ins name="NOT.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE" unit="SFU">
     <desc>
       For fully featured bitwise operation, see the shift opcodes.
     </desc>
     <src/>
   </ins>
 
-  <ins name="NOT.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE">
+  <ins name="NOT.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE" unit="SFU">
     <desc>
       For fully featured bitwise operation, see the shift opcodes.
     </desc>
     <src/>
   </ins>
 
-  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95">
+  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95" unit="SFU">
+    <!-- TODO: confirm unit -->
     <desc>
       Returns the mask of lanes ever active within the warp (subgroup), such
       that the source is nonzero. The number of work-items in a subgroup is
@@ -1094,7 +1095,7 @@
     <subgroup/>
   </ins>
 
-  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99">
+  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99" unit="CVT">
     <ins name="FREXPM.f32" opcode2="0"/>
     <ins name="FREXPM.v2f16" opcode2="1"/>
     <ins name="FREXPE.f32" opcode2="2"/>
@@ -1109,7 +1110,7 @@
     <src float="true" swizzle="true"/>
   </group>
 
-  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unit="SFU">
     <ins name="FRCP.f32" opcode2="0"/>
     <ins name="FRCP.f16" opcode2="1"/>
     <ins name="FRSQ.f32" opcode2="2"/>
@@ -1121,10 +1122,10 @@
       The logarithm instruction (`FLOGD.f32`) requires an argument reduction. See the
       transcendentals section for more information.
     </desc>
-    <src float="true" swizzle="true"/>
+    <src float="true" swizzle="true" absneg="true"/>
   </group>
 
-  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C" unit="SFU">
     <ins name="FSIN_TABLE.u6" opcode2="4"/>
     <ins name="FCOS_TABLE.u6" opcode2="5"/>
     <desc>
@@ -1134,7 +1135,7 @@
     <src/>
   </group>
 
-  <group name="FADD" title="Floating-point add" dests="1" opcode2="0">
+  <group name="FADD" title="Floating-point add" dests="1" opcode2="0" unit="FMA">
     <ins name="FADD.f32" opcode="0xA4"/>
     <ins name="FADD.v2f16" opcode="0xA5"/>
     <desc>$A + B$</desc>
@@ -1143,7 +1144,7 @@
     <src absneg="true" swizzle="true">B</src>
   </group>
 
-  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2">
+  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2" unit="CVT">
     <ins name="FMIN.f32" opcode="0xA4"/>
     <ins name="FMIN.v2f16" opcode="0xA5"/>
     <desc>$\min \{ A, B \}$</desc>
@@ -1152,7 +1153,7 @@
     <src absneg="true" swizzle="true">B</src>
   </group>
 
-  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3">
+  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3" unit="CVT">
     <ins name="FMAX.f32" opcode="0xA4"/>
     <ins name="FMAX.v2f16" opcode="0xA5"/>
     <desc>$\max \{ A, B \}$</desc>
@@ -1161,7 +1162,7 @@
     <src absneg="true" swizzle="true">B</src>
   </group>
 
-  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4">
+  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4" unit="CVT">
     <ins name="V2F32_TO_V2F16" opcode="0xA5"/>
     <desc>
       Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
@@ -1171,7 +1172,7 @@
     <src>B</src>
   </group>
 
-  <group name="FRSCALE" title="Floating-point rescaling" dests="1" opcode2="6">
+  <group name="FRSCALE" title="Floating-point rescaling" dests="1" opcode2="6" unit="FMA">
     <ins name="FRSCALE.f32" opcode="0xA4"/>
     <ins name="FRSCALE.v2f16" opcode="0xA5"/>
     <desc>
@@ -1185,7 +1186,7 @@
     <src absneg="true" swizzle="true">B</src>
   </group>
 
-  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8">
+  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8" unit="SFU">
     <desc>
       Calculates the base-2 exponent of an argument specified as a 8:24
       fixed-point. The original argument is passed as well for correct handling
@@ -1196,7 +1197,7 @@
     <src absneg="true">Input as 32-bit float</src>
   </ins>
 
-  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9">
+  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9" unit="FMA">
     <desc>
       Performs a floating-point addition specialized for logarithm computation.
     </desc>
@@ -1205,7 +1206,7 @@
     <src absneg="true">B</src>
   </ins>
 
-  <group name="IADD" title="Integer addition" dests="1" opcode2="0">
+  <group name="IADD" title="Integer addition" dests="1" opcode2="0" unit="CVT">
     <desc>
       $A + B$ with optional saturation.
 
@@ -1226,13 +1227,13 @@
     <saturate/>
   </group>
 
-  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5">
+  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5" unit="CVT">
     <desc>Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`</desc>
     <src widen="true">A</src>
     <src widen="true">B</src>
   </ins>
 
-  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1">
+  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1" unit="CVT">
     <ins name="ISUB.u32" opcode="0xA0"/>
     <ins name="ISUB.v2u16" opcode="0xA1"/>
     <ins name="ISUB.v4u8" opcode="0xA2"/>
@@ -1247,7 +1248,7 @@
     <saturate/>
   </group>
 
-  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7">
+  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7" unit="CVT">
     <desc>
       Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
       64-bit value A. These instructions accelerate address arithmetic, but may
@@ -1260,7 +1261,7 @@
     <src widen="true">B</src>
   </group>
 
-  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A">
+  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A" unit="SFU">
     <ins name="IMUL.i32" opcode="0xA0"/>
     <ins name="IMUL.v2i16" opcode="0xA1"/>
     <ins name="IMUL.v4i8" opcode="0xA2"/>
@@ -1281,7 +1282,8 @@
     <saturate/>
   </group>
 
-  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B">
+  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B" unit="CVT">
+    <!-- TODO: confirm unit -->
     <ins name="HADD.u32" opcode="0xA0"/>
     <ins name="HADD.v2u16" opcode="0xA1"/>
     <ins name="HADD.v4u8" opcode="0xA2"/>
@@ -1298,7 +1300,7 @@
     </desc>
   </group>
 
-  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF">
+  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF" unit="SFU">
     <ins name="CLPER.i32" opcode="0xA0"/>
     <ins name="CLPER.v2u16" opcode="0xA1"/>
     <ins name="CLPER.v4u8" opcode="0xA2"/>
@@ -1320,7 +1322,7 @@
     <inactive_result/>
   </group>
 
-  <group name="FMA" title="Fused floating-point multiply add" dests="1">
+  <group name="FMA" title="Fused floating-point multiply add" dests="1" unit="FMA">
     <ins name="FMA.f32" opcode="0xB2"/>
     <ins name="FMA.v2f16" opcode="0xB3"/>
     <desc>$A \cdot B + C$</desc>
@@ -1330,7 +1332,7 @@
     <src absneg="true" swizzle="true">C</src>
   </group>
 
-  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100">
+  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100" unit="SFU">
     <ins name="LSHIFT_AND.i32" opcode="0xB4"/>
     <ins name="LSHIFT_AND.v2i16" opcode="0xB5"/>
     <ins name="LSHIFT_AND.v4i8" opcode="0xB6"/>
@@ -1346,7 +1348,7 @@
     <src not="true">B</src>
   </group>
 
-  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000">
+  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000" unit="SFU">
     <ins name="RSHIFT_AND.i32" opcode="0xB4"/>
     <ins name="RSHIFT_AND.v2i16" opcode="0xB5"/>
     <ins name="RSHIFT_AND.v4i8" opcode="0xB6"/>
@@ -1362,7 +1364,7 @@
     <src not="true">B</src>
   </group>
 
-  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101">
+  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101" unit="SFU">
     <ins name="LSHIFT_OR.i32" opcode="0xB4"/>
     <ins name="LSHIFT_OR.v2i16" opcode="0xB5"/>
     <ins name="LSHIFT_OR.v4i8" opcode="0xB6"/>
@@ -1378,7 +1380,7 @@
     <src not="true">B</src>
   </group>
 
-  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001">
+  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001" unit="SFU">
     <ins name="RSHIFT_OR.i32" opcode="0xB4"/>
     <ins name="RSHIFT_OR.v2i16" opcode="0xB5"/>
     <ins name="RSHIFT_OR.v4i8" opcode="0xB6"/>
@@ -1394,7 +1396,7 @@
     <src not="true">B</src>
   </group>
 
-  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102">
+  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102" unit="SFU">
     <ins name="LSHIFT_XOR.i32" opcode="0xB4"/>
     <ins name="LSHIFT_XOR.v2i16" opcode="0xB5"/>
     <ins name="LSHIFT_XOR.v4i8" opcode="0xB6"/>
@@ -1410,7 +1412,7 @@
     <src not="true">B</src>
   </group>
 
-  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002">
+  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002" unit="SFU">
     <ins name="RSHIFT_XOR.i32" opcode="0xB4"/>
     <ins name="RSHIFT_XOR.v2i16" opcode="0xB5"/>
     <ins name="RSHIFT_XOR.v4i8" opcode="0xB6"/>
@@ -1426,7 +1428,7 @@
     <src not="true">B</src>
   </group>
 
-  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8">
+  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8" unit="SFU">
     <desc>
       Mux between A and B based on the provided mask. Equivalent to
       `bitselect()` in OpenCL. `(A & mask) | (A & ~mask)`
@@ -1436,21 +1438,21 @@
     <src>Mask</src>
   </ins>
 
-  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0">
+  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0" unit="SFU">
     <desc>During a cube map transform, select the S coordinate given a selected face.</desc>
     <src absneg="true">Z coordinate as 32-bit floating point</src>
     <src absneg="true">X coordinate as 32-bit floating point</src>
     <src>Cube face index</src>
   </ins>
 
-  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1">
+  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1" unit="SFU">
     <desc>During a cube map transform, select the T coordinate given a selected face.</desc>
     <src absneg="true">Y coordinate as 32-bit floating point</src>
     <src absneg="true">Z coordinate as 32-bit floating point</src>
     <src>Cube face index</src>
   </ins>
 
-  <ins name="MKVEC.v4i8" title="Make 8-bit vector" dests="1" opcode="0xBD">
+  <ins name="MKVEC.v4i8" title="Make 8-bit vector" dests="1" opcode="0xBD" unit="CVT">
     <desc>
       Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.
 
@@ -1465,21 +1467,22 @@
     <src>CD</src>
   </ins>
 
-  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0">
+  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0" unit="SFU">
     <desc>Select the maximum absolute value of its arguments.</desc>
     <src absneg="true">X coordinate as 32-bit floating point</src>
     <src absneg="true">Y coordinate as 32-bit floating point</src>
     <src absneg="true">Z coordinate as 32-bit floating point</src>
   </ins>
 
-  <ins name="CUBEFACE2" title="Cube map transform step 2" dests="1" opcode="0xC1">
+  <ins name="CUBEFACE2" title="Cube map transform step 2" dests="1" opcode="0xC1" unit="SFU">
     <desc>Select the cube face index corresponding to the arguments.</desc>
     <src absneg="true">X coordinate as 32-bit floating point</src>
     <src absneg="true">Y coordinate as 32-bit floating point</src>
     <src absneg="true">Z coordinate as 32-bit floating point</src>
   </ins>
 
-  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2">
+  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2" unit="SFU">
+    <!-- TODO: confirm unit -->
     <desc>
       8-bit integer dot product between 4 channel vectors, intended for machine
       learning. Available in both unsigned and signed variants, controlling
@@ -1500,7 +1503,7 @@
     <saturate/>
   </group>
 
-  <group name="ICMP" title="Unsigned integer compare" dests="1">
+  <group name="ICMP" title="Unsigned integer compare" dests="1" unit="CVT">
     <desc>
       Evaluates the given condition, do a logical and/or with the condition in
       the result source, and return in the given result type (integer
@@ -1528,7 +1531,7 @@
     <src>C</src>
   </group>
 
-  <group name="FCMP" title="Floating-point compare" dests="1">
+  <group name="FCMP" title="Floating-point compare" dests="1" unit="CVT">
     <desc>
       Evaluates the given condition, do a logical and/or with the condition in
       the result source, and return in the given result type (integer
@@ -1547,7 +1550,7 @@
     <src>C</src>
   </group>
 
-  <group name="ICMP" title="Signed integer compare" dests="1">
+  <group name="ICMP" title="Signed integer compare" dests="1" unit="CVT">
     <desc>
       Evaluates the given condition, do a logical and/or with the condition in
       the result source, and return in the given result type (integer
@@ -1575,7 +1578,7 @@
     <src>C</src>
   </group>
 
-  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110">
+  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110" unit="CVT">
     <desc>
       Adds an arbitrary 32-bit immediate embedded within the instruction stream.
       If no modifiers are required, this is preferred to `IADD.i32` with a
@@ -1588,7 +1591,7 @@
     <imm name="constant" start="8" size="32"/>
   </ins>
 
-  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111">
+  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111" unit="CVT">
     <desc>
       Adds an arbitrary pair of 16-bit immediates embedded within the
       instruction stream. If no modifiers are required, this is preferred to
@@ -1600,7 +1603,7 @@
     <imm name="constant" start="8" size="32"/>
   </ins>
 
-  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112">
+  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112" unit="CVT">
     <desc>
       Adds an arbitrary quad of 8-bit immediates embedded within the
       instruction stream. If no modifiers are required, this is preferred to
@@ -1612,7 +1615,7 @@
     <imm name="constant" start="8" size="32"/>
   </ins>
 
-  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114">
+  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114" unit="FMA">
     <desc>
       Adds an arbitrary 32-bit immediate embedded within the instruction stream.
       If no modifiers are required, this is preferred to `FADD.f32` with a
@@ -1623,7 +1626,7 @@
     <imm name="constant" start="8" size="32"/>
   </ins>
 
-  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115">
+  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115" unit="FMA">
     <desc>
       Adds an arbitrary pair of 16-bit immediates embedded within the
       instruction stream. If no modifiers are required, this is preferred to
@@ -1635,7 +1638,7 @@
     <imm name="constant" start="8" size="32"/>
   </ins>
 
-  <ins name="TODO.ATOM_C1" title="Atomic operations on memory with 1" opcode="0x69">
+  <ins name="TODO.ATOM_C1" title="Atomic operations on memory with 1" opcode="0x69" unit="LS">
     <!-- TODO -->
     <mod name="i32" start="17" size="1"/>
     <mod name="unk" start="23" size="1"/>
@@ -1646,7 +1649,7 @@
     <slot/>
   </ins>
 
-  <ins name="TODO.ATOM_C" title="Atomic operations on memory" opcode="0x120">
+  <ins name="TODO.ATOM_C" title="Atomic operations on memory" opcode="0x120" unit="LS">
     <!-- TODO -->
     <mod name="i32" start="17" size="1"/>
     <mod name="unk" start="23" size="1"/>
@@ -1657,7 +1660,7 @@
     <slot/>
   </ins>
 
-  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125">
+  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125" unit="T">
     <desc>Unfiltered textured instruction.</desc>
     <sr read="true"/>
     <sr write="true" count="4"/>
@@ -1669,7 +1672,7 @@
     <src>Image to read from</src>
   </ins>
 
-  <ins name="TEX" title="Texture load" opcode="0x128">
+  <ins name="TEX" title="Texture load" opcode="0x128" unit="T">
     <desc>Ordinary texturing instruction using a sampler.</desc>
     <sr read="true"/>
     <sr write="true" count="4"/>
@@ -1683,8 +1686,11 @@
     <slot/>
   </ins>
 
-  <ins name="TODO.VAR_TEX" title="Fused varying-texturing" opcode="0x130">
-    <desc>Only works for FP32 varyings.</desc>
+  <ins name="TODO.VAR_TEX" title="Fused varying-texturing" opcode="0x130" unit="VT">
+    <desc>
+      Only works for FP32 varyings. Performance characteristics are similar
+      to LD_VAR_IMM_F32.v2.f32 followed by TEX, using both V and T units.
+    </desc>
     <sr write="true" count="4"/>
     <mod name="dimension" start="28" size="2"/>
     <mod name="skip" start="39" size="1"/>
@@ -1692,7 +1698,7 @@
     <src>Image to read from</src>
   </ins>
 
-  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160">
+  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160" unit="FMA">
     <desc>
       First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
       special transcendental function sequences. It should not be used for