From 316486dd9f6bbd03e7e13655674f1fa91e533b9a Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@collabora.com>
Date: Fri, 16 Jul 2021 10:40:58 -0400
Subject: [PATCH] pan/va: Add initial ISA.xml for Valhall

This handwritten file is the product of over a hundred hours of
reverse-engineering and represents the sum of what I've learned about
the Valhall architecture. It will be used in the next commits as the
backbone of a Valhall toolchain.

Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12025>
---
 src/panfrost/bifrost/valhall/ISA.xml | 1683 ++++++++++++++++++++++++++++++++++
 1 file changed, 1683 insertions(+)
 create mode 100644 src/panfrost/bifrost/valhall/ISA.xml
diff --git a/src/panfrost/bifrost/valhall/ISA.xml b/src/panfrost/bifrost/valhall/ISA.xml
new file mode 100644
index 0000000..902d1a0
--- /dev/null
+++ b/src/panfrost/bifrost/valhall/ISA.xml
@@ -0,0 +1,1683 @@
+<!--
+  Copyright (C) 2021 Collabora Ltd.
+
+  Permission is hereby granted, free of charge, to any person obtaining a
+  copy of this software and associated documentation files (the "Software"),
+  to deal in the Software without restriction, including without limitation
+  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+  and/or sell copies of the Software, and to permit persons to whom the
+  Software is furnished to do so, subject to the following conditions:
+
+  The above copyright notice and this permission notice (including the next
+  paragraph) shall be included in all copies or substantial portions of the
+  Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+  THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+  SOFTWARE.
+-->
+
+<valhall>
+  <lut name="Immediates">
+    <desc>
+      These immediates are accessible in (almost) any instruction, provided the
+      immediate mode is kept to the default. They optimize for the most common
+      immediate values; any immediate listed here may be used without taking up
+      a uniform slot or a register. Most integer instructions can access
+      separate half-words and individual bytes via swizzles on the source.
+    </desc>
+    <constant desc="Zero">0x00000000</constant>
+    <constant desc="All ones; integer $-1$">0xFFFFFFFF</constant>
+    <constant desc="Maximum integer; floating-point NaN">0x7FFFFFFF</constant>
+    <constant desc="Integers $(-2, -3, -4, -5)$">0xFAFCFDFE</constant> <!-- NOTE(review): top byte 0xFA is -6 as a signed byte, not -5; confirm value vs. description -->
+    <constant desc="16-bit integer $2^8$">0x01000000</constant>
+    <constant desc="Multiples of 16 $(0, 32, 0, 128)$">0x80002000</constant>
+    <constant desc="Multiples of 16 $(48, 80, 96, 112)$">0x70605030</constant>
+    <constant desc="Multiples of 16 $(144, 160, 176, 192)$">0xC0B0A090</constant>
+    <constant desc="Integers $(0, 1, 2, 3)$">0x03020100</constant>
+    <constant desc="Integers $(4, 5, 6, 7)$">0x07060504</constant>
+    <constant desc="Integers $(8, 9, 10, 11)$">0x0B0A0908</constant>
+    <constant desc="Integers $(12, 13, 14, 15)$">0x0F0E0D0C</constant>
+    <constant desc="Integers $(16, 17, 18, 19)$">0x13121110</constant>
+    <constant desc="Integers $(20, 21, 22, 23)$">0x17161514</constant>
+    <constant desc="Integers $(24, 25, 26, 27)$">0x1B1A1918</constant>
+    <constant desc="Integers $(28, 29, 30, 31)$">0x1F1E1D1C</constant>
+    <constant desc="Float $1.0$">0x3F800000</constant>
+    <constant desc="Float $0.1$">0x3DCCCCCD</constant>
+    <constant desc="Float $1 / \pi$">0x3EA2F983</constant>
+    <constant desc="Float $\log(2)$">0x3F317218</constant>
+    <constant desc="Float $\pi$">0x40490FDB</constant>
+    <constant desc="Float $0.0$">0x00000000</constant>
+    <constant desc="Float $65535.0 = 2^{16} - 1$">0x477FFF00</constant>
+    <constant desc="Half-float $(255.0, 256.0) = (2^8 - 1, 2^8)$">0x5C005BF8</constant>
+    <constant desc="Half-float $0.1 = 1 / 10$">0x2E660000</constant>
+    <constant desc="Half-float $0.25 = 2^{-2}$">0x34000000</constant>
+    <constant desc="Half-float $0.5 = 2^{-1}$">0x38000000</constant>
+    <constant desc="Half-float $1.0 = 2^0$">0x3C000000</constant>
+    <constant desc="Half-float $2.0 = 2^1$">0x40000000</constant>
+    <constant desc="Half-float $4.0 = 2^2$">0x44000000</constant>
+    <constant desc="Half-float $8.0 = 2^3$">0x48000000</constant>
+    <constant desc="Half-float $\pi$">0x42480000</constant>
+  </lut>
+
+  <enum name="Action">
+    <desc>
+      Every Valhall instruction can perform an action, like wait on dependency
+      slots. A few special actions are available, specified in the instruction
+      metadata from this enum. The `wait0126` action is required to wait on
+      dependency slot #6 and should be set on the instruction immediately
+      preceding `ATEST`. The `barrier` action may be set on any instruction for
+      subgroup barriers, and should particularly be set with the `BARRIER`
+      instruction for global barriers. The `td` action only applies to fragment
+      shaders and is used to terminate helper invocations; it should be set as
+      early as possible after helper invocations are no longer needed as
+      determined by data flow analysis. The `return` action is used to terminate
+      the shader, although it may be overloaded by the `BLEND` instruction.
+
+      The `reconverge` action is required on any instruction immediately
+      preceding a possible change to the mask of active threads in a subgroup.
+      This includes all divergent branches, but it also includes the final
+      instruction at the end of any basic block where the immediate successor
+      (fallthrough) is the target of a divergent branch.
+    </desc>
+    <value desc="Wait on all dependency slots">wait0126</value>
+    <value desc="Subgroup barrier">barrier</value>
+    <value desc="Perform branch reconverge">reconverge</value>
+    <reserved/>
+    <reserved/>
+    <value desc="Terminate discarded threads">td</value>
+    <reserved/>
+    <value desc="Return from shader">return</value>
+  </enum>
+
+  <enum name="Immediate mode">
+    <desc>Selects how immediate sources are interpreted.</desc>
+    <value desc="No special immediates" default="true">none</value>
+    <value desc="Thread storage pointers">ts</value>
+    <reserved/>
+    <value desc="Thread identification">id</value>
+  </enum>
+
+  <enum name="Thread storage pointers">
+    <desc>
+      Situated between the immediates hard-coded in the hardware and the
+      uniforms defined purely in software, Valhall has some special
+      "constants" passing through data structures. These are encoded like the
+      table of immediates, as if special constant $i$ were lookup table entry
+      $32 + i$. These special values are selected with the `.ts` modifier.
+    </desc>
+    <reserved/>
+    <reserved/>
+    <value desc="Thread local storage base pointer (low word)">tls_ptr</value>
+    <value desc="Thread local storage base pointer (high word)">tls_ptr_hi</value>
+    <reserved/>
+    <reserved/>
+    <value desc="Workgroup local storage base pointer (low word)">wls_ptr</value>
+    <value desc="Workgroup local storage base pointer (high word)">wls_ptr_hi</value>
+  </enum>
+
+  <enum name="Thread identification">
+    <desc>
+      Situated between the immediates hard-coded in the hardware and the
+      uniforms defined purely in software, Valhall has some special
+      "constants" passing through data structures. These are encoded like the
+      table of immediates, as if special constant $i$ were lookup table entry
+      $32 + i$. These special values are selected with the `.id` modifier.
+    </desc>
+    <reserved/>
+    <reserved/>
+    <value desc="Lane ID">lane_id</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value desc="Core ID">core_id</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value desc="Program counter">program_counter</value>
+    <reserved/>
+  </enum>
+
+  <enum name="Swizzles (8-bit)">
+    <value default="true">b0123</value>
+    <value>b3210</value>
+    <value>b0101</value>
+    <value>b2323</value>
+    <value>b0000</value>
+    <value>b1111</value>
+    <value>b2222</value>
+    <value>b3333</value>
+    <value>b2301</value>
+    <value>b1032</value>
+    <value>b0011</value>
+    <value>b2233</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Lanes (8-bit)">
+    <desc>Used to select the 2 bytes for shifts of 16-bit vectors</desc>
+    <value>b02</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value>b00</value>
+    <value>b11</value>
+    <value>b22</value>
+    <value>b33</value>
+    <reserved/>
+    <reserved/>
+    <value>b01</value>
+    <value>b23</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Swizzles (16-bit)">
+    <value>h00</value> <!-- 0,2 -->
+    <value>h10</value>
+    <value default="true">h01</value>
+    <value>h11</value>
+    <value>b00</value> <!-- 0,0 -->
+    <value>b20</value> <!-- 1,1 -->
+    <value>b02</value> <!-- 2,2 -->
+    <value>b22</value> <!-- 3,3 -->
+    <value>b11</value>
+    <value>b31</value>
+    <value>b13</value> <!-- 0,1 -->
+    <value>b33</value> <!-- 2,3 -->
+    <value>b01</value>
+    <value>b23</value>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Swizzles (32-bit)">
+    <value default="true">none</value>
+    <reserved/>
+    <value>h0</value>
+    <value>h1</value>
+    <value>b0</value>
+    <value>b1</value>
+    <value>b2</value>
+    <value>b3</value>
+  </enum>
+
+  <enum name="Swizzles (64-bit)">
+    <value default="true">none</value>
+    <reserved/>
+    <value>h0</value>
+    <value>h1</value>
+    <value>b0</value>
+    <value>b1</value>
+    <value>b2</value>
+    <value>b3</value>
+    <value>w0</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Lane (8-bit)" implied="true">
+    <value>b0</value>
+    <value>b1</value>
+    <value>b2</value>
+    <value>b3</value>
+  </enum>
+
+  <enum name="Lane (32-bit)">
+    <desc>
+      Used for the lane select of `BRANCHZ`. To use an 8-bit condition, a
+      separate `ICMP` is required to cast to 16-bit.
+    </desc>
+    <value default="true">none</value>
+    <value>h0</value>
+    <value>h1</value>
+    <reserved/>
+  </enum>
+
+  <enum name="Lane (16-bit)" implied="true">
+    <value>h0</value>
+    <value>h1</value>
+  </enum>
+
+  <enum name="Load lane (8-bit)"> <!-- presumably the enum behind the load_lane_8_bit modifier of LOAD.i8; confirm against the parser -->
+    <value default="true">b0</value>
+    <value>b1</value>
+    <value>b2</value>
+    <value>b3</value>
+    <value desc="Zero-extend to 16-bit, low-half">h0</value>
+    <value desc="Zero-extend to 16-bit, high-half">h1</value>
+    <value desc="Zero-extend to 32-bit">w0</value>
+    <value desc="Zero-extend to 64-bit">d0</value>
+  </enum>
+
+  <enum name="Load lane (16-bit)">
+    <value desc="Low half" default="true">h0</value>
+    <value desc="High half">h1</value>
+    <value desc="Zero-extend to 32-bit">w0</value>
+    <value desc="Zero-extend to 64-bit">d0</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Load lane (24-bit)" implied="true">
+    <value default="true">identity</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Load lane (32-bit)">
+    <value default="true">w0</value>
+    <value desc="Zero-extend to 64-bit">d0</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Load lane (48-bit)">
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value default="true">identity</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Load lane (64-bit)">
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value default="true">identity</value>
+  </enum>
+
+  <enum name="Load lane (96-bit)">
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value default="true">identity</value>
+    <reserved/>
+  </enum>
+
+  <enum name="Load lane (128-bit)">
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value default="true">identity</value>
+  </enum>
+
+  <enum name="Round mode">
+    <desc>Corresponds to IEEE 754 rounding modes</desc>
+    <value desc="Round to nearest even" default="true">rte</value>
+    <value desc="Round to positive infinity">rtp</value>
+    <value desc="Round to negative infinity">rtn</value>
+    <value desc="Round to zero">rtz</value>
+  </enum>
+
+  <enum name="Result type">
+    <desc>
+      Comparison instructions like `FCMP` return a boolean but may encode this
+      boolean in a variety of ways. `i1` gives an OpenGL style `0/1` boolean.
+      `m1` gives a Direct3D style `0/~0` boolean. `f1` gives a floating-point
+      `0.0f / 1.0f` boolean. Switching between these modes is useful to fold a
+      boolean type convert into a comparison. `u1` is used internally to
+      implement 64-bit comparisons.
+    </desc>
+    <value desc="Integer 1">i1</value>
+    <value desc="Float 1">f1</value>
+    <value desc="Minus 1">m1</value>
+    <value desc="Low half of 64-bit compare">u1</value>
+  </enum>
+
+  <enum name="Widen">
+    <value default="true">none</value>
+    <value>h0</value>
+    <value>h1</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Clamp">
+    <desc>
+      Clamp applied to the destination of a floating-point instruction. Note the
+      clamps may be decomposed as two independent bits for `clamp_0_inf` and
+      `clamp_m1_1`, with `clamp_0_1` arising as the composition of `clamp_0_inf`
+      and `clamp_m1_1` in either order.
+    </desc>
+    <value default="true" desc="Identity">none</value>
+    <value desc="Clamp positive">clamp_0_inf</value>
+    <value desc="Clamp to $[-1, 1]$">clamp_m1_1</value>
+    <value desc="Clamp to $[0, 1]$">clamp_0_1</value>
+  </enum>
+
+  <enum name="Condition">
+    <desc>
+      Condition code. Type must be inferred from the instruction. IEEE 754 total
+      ordering only applies to floating point compares. "Not equal" and "greater
+      than or less than" are distinguished by NaN behaviour conforming to
+      the IEEE 754 specification.
+    </desc>
+    <value desc="Equal">eq</value>
+    <value desc="Greater than">gt</value>
+    <value desc="Greater than or equal">ge</value>
+    <value desc="Not equal">ne</value>
+    <value desc="Less than">lt</value>
+    <value desc="Less than or equal">le</value>
+    <value desc="Greater than or less than">gtlt</value>
+    <value desc="Totally ordered">total</value>
+  </enum>
+
+  <enum name="Dimension">
+    <desc>Texture dimension.</desc>
+    <value desc="1D or buffer">1d</value>
+    <value desc="2D or 2D array">2d</value>
+    <value desc="3D or 3D array">3d</value>
+    <value desc="Cube map or cube map array">cube</value>
+  </enum>
+
+  <enum name="LOD mode">
+    <desc>Level-of-detail selection mode in a texture instruction.</desc>
+    <value desc="Set to zero">zero</value>
+    <value desc="Computed based on neighboring fragments">computed</value>
+    <reserved/>
+    <reserved/>
+    <value desc="Explicitly specified in a register">explicit</value>
+    <value desc="Computed based on neighboring fragments added with bias in a register">computed_bias</value>
+    <value desc="Derived from a gradient descriptor in registers">grdesc</value>
+    <reserved/>
+  </enum>
+
+  <enum name="Register format">
+    <desc>Format of data loaded to / stored from registers for general memory access.</desc>
+    <reserved/>
+    <reserved/>
+    <value desc="32-bit floats">f32</value>
+    <value desc="16-bit floats">f16</value>
+    <value desc="32-bit unsigned integers">u32</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+  </enum>
+
+  <enum name="Staging register count" implied="true">
+    <value>sr0</value>
+    <value>sr1</value>
+    <value>sr2</value>
+    <value>sr3</value>
+    <value>sr4</value>
+    <value>sr5</value>
+    <value>sr6</value>
+    <value>sr7</value>
+  </enum>
+
+  <enum name="Vector size">
+    <desc>Number of channels loaded/stored for general memory access.</desc>
+    <value default="true" desc="Scalar">none</value>
+    <value desc="2 channels">v2</value>
+    <value desc="3 channels">v3</value>
+    <value desc="4 channels">v4</value>
+  </enum>
+
+  <enum name="Slot">
+    <desc>
+      Dependency slot set on a message-passing instruction that writes to
+      registers. Before reading the destination, a future instruction must wait
+      on the specified slot. Slot #7 is for `BARRIER` instructions only.
+    </desc>
+    <value desc="Slot #0">slot0</value>
+    <value desc="Slot #1">slot1</value>
+    <value desc="Slot #2">slot2</value>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <reserved/>
+    <value desc="Slot #7">slot7</value>
+  </enum>
+
+  <enum name="Store segment">
+    <desc>Memory segment written to by a `STORE` instruction.</desc>
+    <value desc="Global or workgroup local memory" default="true">global</value>
+    <value desc="Position output (in a position shader)">pos</value>
+    <value desc="Varyings with LEA_ATTR computed addresses">vary</value>
+    <value desc="Thread local storage">tl</value>
+  </enum>
+
+  <enum name="Subgroup size">
+    <desc>
+      Selects the effective subgroup size for subgroup operations. The hardware
+      warps are sixteen threads on Valhall, but subdividing a warp may be useful
+      for API requirements. In particular, derivatives may be calculated with
+      quads (four threads).
+    </desc>
+    <value desc="Two threads">subgroup2</value>
+    <value desc="Four threads">subgroup4</value>
+    <value desc="Eight threads">subgroup8</value>
+    <value desc="Sixteen threads" default="true">subgroup16</value>
+  </enum>
+
+  <enum name="Lane operation">
+    <desc>
+      Acts as a modifier on the lane specifier for a `CLPER` instruction. The
+      `accumulate` mode is required for efficient subgroup reductions.
+    </desc>
+    <value desc="No operation" default="true">none</value>
+    <value desc="Exclusive-or">xor</value>
+    <value desc="Accumulate">accumulate</value>
+    <value desc="Shift">shift</value>
+  </enum>
+
+  <enum name="Inactive result">
+    <desc>
+      Accesses to inactive lanes (due to divergence) in a subgroup are generally
+      undefined in APIs. However, the results of permuting with an inactive lane
+      with `CLPER.i32` are well-defined in Valhall: they return one of the
+      following values, as specified in the `CLPER.i32` instructions. Sometimes
+      certain values enable small optimizations.
+    </desc>
+    <value desc="0x00000000" default="true">zero</value>
+    <value desc="0xFFFFFFFF">umax</value>
+    <value desc="0x00000001">i1</value>
+    <value desc="0x00010001">v2i1</value>
+    <value desc="0x80000000">smin</value>
+    <value desc="0x7FFFFFFF">smax</value>
+    <value desc="0x80008000">v2smin</value>
+    <value desc="0x7FFF7FFF">v2smax</value>
+    <value desc="0x80808080">v4smin</value>
+    <value desc="0x7F7F7F7F">v4smax</value>
+    <value desc="0x3F800000">f1</value>
+    <value desc="0x3C003C00">v2f1</value>
+    <value desc="0xFF800000">infn</value>
+    <value desc="0x7F800000">inf</value>
+    <value desc="0xFC00FC00">v2infn</value>
+    <value desc="0x7C007C00">v2inf</value>
+  </enum>
+
+  <ins name="NOP" title="No operation" dests="0" opcode="0x00">
+    <desc>
+      Do nothing. Useful at the start of a block for waiting on slots required
+      by the first actual instruction of the block, to reconcile dependencies
+      after a branch. Also useful as the sole instruction of an empty shader.
+    </desc>
+  </ins>
+
+  <ins name="BRANCHZ" title="Compare to zero and branch" dests="0" opcode="0x1F">
+    <desc>
+      Branches to a specified relative offset if its source is nonzero (default)
+      or if its source is zero (if `.eq` is set). The offset is 27-bits and
+      sign-extended, giving an effective range of ±26-bits. The offset is
+      specified in units of instructions, relative to the *next* instruction.
+      Positive offsets may be interpreted as "number of instructions to skip".
+      Since Valhall instructions are 8 bytes, this operates as:
+
+      $$PC := \begin{cases} PC + 8 \cdot (\text{offset} \; + 1) &amp; \text{if} \;
+      \text{src} \stackrel{?}{=} 0 \\ PC + 8 &amp; \text{otherwise} \end{cases}$$
+
+      Used with comparison instructions to implement control flow. Tie the
+      source to a nonzero constant to implement a jump. May introduce
+      divergence, so generally requires `.reconverge` flow control.
+    </desc>
+    <src lane="37">Value to compare against zero</src>
+    <imm name="offset" start="8" size="27" signed="true"/>
+    <mod name="eq" start="36" size="1"/>
+  </ins>
+
+  <ins name="DISCARD.f32" title="Discard fragment" opcode="0x20">
+    <desc>
+      Evaluates the given condition, and if it passes, discards the current
+      fragment and terminates the thread. The destination should be set to R60.
+      Only valid in a **fragment** shader.
+    </desc>
+    <cmp/>
+    <dest>Updated coverage mask (set to R60)</dest>
+    <src absneg="true" swizzle="true">Left value to compare</src>
+    <src absneg="true" swizzle="true">Right value to compare</src>
+  </ins>
+
+  <ins name="BRANCHZI" title="Compare to zero and branch indirect" opcode="0x2F">
+    <desc>
+      Jump to an indirectly specified address. Used to jump to blend shaders at
+      the end of a fragment shader.
+    </desc>
+    <src>Value to compare against zero</src>
+    <src>Branch target</src>
+    <mod name="eq" start="36" size="1"/>
+  </ins>
+
+  <ins name="BARRIER" title="Execution and memory barrier" opcode="0x45">
+    <desc>
+      General-purpose barrier. Must use slot #7. Must be paired with a
+      `.barrier` action on the instruction.
+    </desc>
+    <slot/>
+  </ins>
+
+  <group name="CSEL" title="Floating-point conditional select" dests="1">
+    <ins name="CSEL.f32" opcode="0x154"/>
+    <ins name="CSEL.v2f16" opcode="0x155"/>
+    <desc>
+      Evaluates the given condition and outputs either the true source or the
+      false source.
+    </desc>
+    <cmp/>
+    <src float="true">Left value to compare</src>
+    <src float="true">Right value to compare</src>
+    <src float="true">Return value if true</src>
+    <src float="true">Return value if false</src>
+  </group>
+
+  <group name="CSEL" title="Integer conditional select" dests="1">
+    <ins name="CSEL.u32" opcode="0x150"/>
+    <ins name="CSEL.v2u16" opcode="0x151"/>
+    <ins name="CSEL.i32" opcode="0x158"/>
+    <ins name="CSEL.v2i16" opcode="0x159"/>
+    <desc>
+      Evaluates the given condition and outputs either the true source or the
+      false source.
+
+      Valhall lacks integer minimum/maximum instructions. `CSEL` instructions
+      with tied operands form the canonical implementations of these
+      instructions. Similarly, the integer $\text{sign}$ function is canonically
+      implemented with a pair of `CSEL` instructions.
+    </desc>
+    <cmp/>
+    <src>Left value to compare</src>
+    <src>Right value to compare</src>
+    <src>Return value if true</src>
+    <src>Return value if false</src>
+  </group>
+
+  <ins name="LD_VAR_SPECIAL" title="Load special varying" opcode="0x56">
+    <sr write="true"/>
+    <sr_count/>
+    <vecsize/>
+    <regfmt/>
+    <slot/>
+    <src/>
+    <imm name="index" start="12" size="4"/> <!-- 0 for pointx, 1 for pointy, 2 for fragw, 3 for fragz -->
+  </ins>
+
+  <group name="LD_VAR_IMM_F32" title="Load immediate varying">
+    <desc>Interpolates a given varying</desc>
+    <ins name="LD_VAR_IMM_F32" opcode="0x5C"/>
+    <ins name="LD_VAR_IMM_F16" opcode="0x5D"/>
+    <sr write="true"/>
+    <vecsize/>
+    <sr_count/>
+    <regfmt/>
+    <slot/>
+    <src/>
+    <src/>
+    <imm name="index" start="20" size="4"/>
+  </group>
+
+  <ins name="LD_ATTR_IMM" title="Load immediate attribute" opcode="0x66">
+    <sr_count/>
+    <vecsize/>
+    <regfmt/>
+    <slot/>
+    <sr write="true"/>
+    <src>Vertex ID</src>
+    <src>Instance ID</src>
+    <imm name="index" start="20" size="4"/>
+  </ins>
+
+  <ins name="LD_ATTR" title="Load indirect attribute" opcode="0x67">
+    <desc>The index must not diverge within a warp.</desc>
+    <vecsize/>
+    <regfmt/>
+    <slot/>
+    <sr_count/>
+    <sr write="true"/>
+    <src>Vertex ID</src>
+    <src>Instance ID</src>
+    <src>Index</src>
+  </ins>
+
+  <ins name="LEA_ATTR" title="Load effective address" opcode="0x5E">
+    <desc>
+      Loads the effective address of the position buffer (in a position shader)
+      or the varying buffer (in a varying shader). That is, the base pointer
+      plus the vertex's linear ID (the first source) times the buffer's
+      per-vertex stride. `LEA_ATTR` should be executed once in a
+      position/varying shader, with the linear ID preloaded as `r59`. Each
+      position/varying store can then be constructed as `STORE` with the base
+      address sourced from the 64-bit destination of `LEA_ATTR` and an
+      appropriately computed offset. Varying stores bypass the usual conversion
+      hardware for attributes; this diverges from earlier Mali hardware.
+    </desc>
+    <sr write="true"/>
+    <sr_count/>
+    <slot/>
+    <imm name="unk" start="8" size="4"/>
+    <src>Linear ID</src>
+  </ins>
+
+  <ins name="LOAD.i8" title="Global memory load" opcode="0x60" opcode2="0">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_8_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i16" title="Global memory load" opcode="0x60" opcode2="1">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_16_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i24" title="Global memory load" opcode="0x60" opcode2="2">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_24_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i32" title="Global memory load" opcode="0x60" opcode2="3">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_32_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i48" title="Global memory load" opcode="0x60" opcode2="4">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_48_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i64" title="Global memory load" opcode="0x60" opcode2="5">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_64_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i96" title="Global memory load" opcode="0x60" opcode2="6">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_96_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <ins name="LOAD.i128" title="Global memory load" opcode="0x60" opcode2="7">
+    <desc>Loads from main memory</desc>
+    <sr write="true"/>
+    <sr_count/>
+    <mod name="load_lane_128_bit" start="36" size="3"/>
+    <mod name="unsigned" start="39" size="1"/>
+    <slot/>
+    <src>Address to load from after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </ins>
+
+  <group name="STORE" title="Global memory store" opcode="0x61">
+    <desc>Stores to main memory</desc>
+    <sr read="true"/>
+    <ins name="STORE.i8" opcode2="0x0"/>
+    <ins name="STORE.i16" opcode2="0x1"/>
+    <ins name="STORE.i24" opcode2="0x2"/>
+    <ins name="STORE.i32" opcode2="0x3"/>
+    <ins name="STORE.i48" opcode2="0x4"/>
+    <ins name="STORE.i64" opcode2="0x5"/>
+    <ins name="STORE.i96" opcode2="0x6"/>
+    <ins name="STORE.i128" opcode2="0x7"/>
+    <sr_count/>
+    <store_segment/>
+    <slot/>
+    <src>Address to store to after adding offset</src>
+    <imm name="offset" start="8" size="16" signed="true"/>
+  </group>
+
+  <ins name="ST_IMAGE" title="Image store" opcode="0x71">
+    <desc>Stores to images</desc>
+    <sr read="true"/>
+    <sr_count/>
+    <slot/>
+    <src>Address to store to after adding offset</src>
+  </ins>
+
+  <ins name="LD_TILE" title="Load from tilebuffer" opcode="0x78">
+    <desc>
+      Loads a given render target, specified in the pixel indices descriptor, at
+      a given location and sample, and convert to the format specified in the
+      internal conversion descriptor. Used to implement EXT_framebuffer_fetch
+      and internally in blend shaders.
+    </desc>
+    <sr write="true"/>
+    <sr_count/>
+    <slot/>
+    <src>Pixel indices descriptor</src>
+    <src>Coverage mask</src>
+    <src>Conversion descriptor</src>
+  </ins>
+
+  <ins name="BLEND" title="Blend render target" opcode="0x7F">
+    <desc>
+      Blends a given render target. This loads the API-specified blend state for
+      the render target from the first source. Blend descriptors are available
+      as special immediates. It then reads the colour to be blended from the
+      first staging register, with the specified vector size and register format
+      as desired. The resulting coverage mask is stored to the second set of
+      staging registers.
+
+      In the fixed-function path, `BLEND` sends the colour to the blender to be
+      written to the tilebuffer. Then, if the instruction's flow control
+      specifies termination, the fragment program is ended. If it does not
+      specify termination, `BLEND` acts as a relative branch, branching with the
+      offset specified as `target`. This allows the subsequent instructions to
+      be skipped when fixed-function blending is used. Note this implicit branch
+      can never introduce divergence, so `.reconverge` is not required.
+
+      In the blend shader path, `BLEND` ignores the specified flow control and
+      does not branch to the specified offset. Instead, execution continues
+      normally with the next instruction. The compiler should insert code for
+      calling a blend shader after the `BLEND` instruction unless it is known
+      that a blend shader will never be required.
+
+      The indirection is required to support both fixed-function and blend
+      shaders efficiently and without shader variants.
+    </desc>
+    <sr read="true"/>
+    <sr write="true" count="1" flags="false"/>
+    <src>Blend descriptor</src>
+    <imm name="target" start="8" size="8"/>
+    <slot/>
+    <sr_count/>
+    <vecsize/>
+    <regfmt/>
+  </ins>
+
+  <ins name="ATEST" title="Alpha test" opcode="0x7D">
+    <desc>
+      Does alpha-to-coverage testing, updating the sample coverage mask. ATEST
+      does not do an implicit discard. It should be executed before the first
+      ZS_EMIT or BLEND instruction.
+    </desc>
+    <sr write="true">Updated coverage mask</sr>
+    <src>Input coverage mask</src>
+    <src>Alpha value (render target 0)</src>
+    <src/>
+    <sr_count/>
+  </ins>
+
+  <ins name="ZS_EMIT" title="Depth/stencil write" opcode="0x7E">
+    <desc>
+      Programmatically writes out depth, stencil, or both, depending on which
+      modifiers are set. Used to implement gl_FragDepth and gl_FragStencil.
+    </desc>
+    <mod name="z" start="25" size="1"/>
+    <mod name="stencil" start="24" size="1"/>
+    <dest>Updated coverage mask</dest>
+    <src>Depth value</src>
+    <src>Stencil value</src>
+    <src>Input coverage mask</src>
+  </ins>
+
+  <group name="CONVERT" title="Data conversions" dests="1" opcode="0x90">
+    <desc>
+      Performs the given data conversion. Note that floating-point rounding is
+      handled via the same hardware and therefore shares an encoding. Round mode
+      is specified where it makes sense.
+    </desc>
+
+    <ins name="S16_TO_S32" opcode2="0x4"/>
+    <ins name="S16_TO_F32" opcode2="0x5"/>
+    <ins name="V2S16_TO_V2F16" opcode2="0x7"/>
+
+    <ins name="S32_TO_F32" opcode2="0x9"/>
+
+    <ins name="U16_TO_U32" opcode2="0x14"/>
+    <ins name="U16_TO_F32" opcode2="0x15"/>
+    <ins name="V2U16_TO_V2F16" opcode2="0x17"/>
+
+    <ins name="U32_TO_F32" opcode2="0x19"/>
+
+    <roundmode/>
+    <src widen="true">Value to convert</src>
+  </group>
+
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+    <desc>Performs the given data conversion.</desc>
+    <ins name="F32_TO_S32" opcode2="0xC"/>
+    <ins name="F32_TO_U32" opcode2="0x1C"/>
+    <roundmode/>
+    <src absneg="true">Value to convert</src>
+  </group>
+
+  <group name="CONVERT" title="Float-to-int data conversions" dests="1" opcode="0x90">
+    <desc>Performs the given data conversion.</desc>
+    <ins name="V2F16_TO_V2S16" opcode2="0xE"/>
+    <ins name="V2F16_TO_V2U16" opcode2="0x1E"/>
+    <ins name="F16_TO_S32" opcode2="0xA"/>
+    <ins name="F16_TO_U32" opcode2="0x1A"/>
+    <roundmode/>
+    <src swizzle="true" absneg="true" size="16">Value to convert</src>
+  </group>
+
+  <ins name="F16_TO_F32" title="16-bit float to 32-bit float conversion" dests="1" opcode="0x90" opcode2="0xB">
+    <desc>Converts up with the specified round mode.</desc>
+    <roundmode/>
+    <src lane="28" size="16" absneg="true">Value to convert</src>
+  </ins>
+
+  <group name="CONVERT" title="8-bit data conversions" dests="1" opcode="0x90">
+    <desc>
+      Performs the given data conversion.
+    </desc>
+
+    <ins name="S8_TO_S32" opcode2="0x0"/>
+    <ins name="S8_TO_F32" opcode2="0x1"/>
+    <ins name="S8_TO_S16" opcode2="0x2"/>
+    <ins name="S8_TO_F16" opcode2="0x3"/>
+
+    <ins name="U8_TO_U32" opcode2="0x10"/>
+    <ins name="U8_TO_F32" opcode2="0x11"/>
+    <ins name="U8_TO_U16" opcode2="0x12"/>
+    <ins name="U8_TO_F16" opcode2="0x13"/>
+
+    <src lane="28" size="8">Value to convert</src>
+  </group>
+
+  <group name="FROUND" title="Floating-point rounding" dests="1" opcode="0x90">
+    <desc>
+      Performs the given rounding, using the convert unit.
+    </desc>
+
+    <ins name="FROUND.f32" opcode2="0xD"/>
+    <ins name="FROUND.v2f16" opcode2="0xF"/>
+
+    <roundmode/>
+    <src swizzle="true" absneg="true">Value to convert</src>
+  </group>
+
+  <ins name="MOV.i32" title="Register move" dests="1" opcode="0x91" opcode2="0x0">
+    <desc>Canonical register-to-register move.</desc>
+    <src/>
+  </ins>
+
+  <ins name="CLZ.u32" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x4">
+    <desc>
+      Used as a primitive for various bitwise operations.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="CLZ.v2u16" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x5">
+    <desc>
+      Used as a primitive for various bitwise operations.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="CLZ.v4u8" title="Count leading zeroes" dests="1" opcode="0x91" opcode2="0x6">
+    <desc>
+      Used as a primitive for various bitwise operations.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="IABS.s32" title="Absolute value" dests="1" opcode="0x91" opcode2="0x8">
+    <desc>
+      64-bit abs may be constructed in 4 instructions (5 clocks) by checking the
+      sign with `ICMP.s32.lt.m1 hi, 0` and negating based on the result with
+      `IADD.s64` and `LSHIFT_XOR.i32` on each half.
+    </desc>
+    <src widen="true"/>
+  </ins>
+
+  <ins name="IABS.v2s16" title="Absolute value" dests="1" opcode="0x91" opcode2="0x9">
+    <src widen="true"/>
+  </ins>
+
+  <ins name="IABS.v4s8" title="Absolute value" dests="1" opcode="0x91" opcode2="0xa">
+    <src/>
+  </ins>
+
+  <ins name="POPCOUNT.i32" title="Population count" dests="1" opcode="0x91" opcode2="0xC">
+    <desc>
+      Only available as 32-bit. Smaller bitsizes require explicit conversions.
+      64-bit popcount may be constructed in 3 clocks by separate 32-bit
+      popcounts of each half and a 32-bit add, which is guaranteed not to
+      overflow.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="BITREV.i32" title="Bitwise reverse" dests="1" opcode="0x91" opcode2="0xD">
+    <desc>
+      Only available as 32-bit. Other bitsizes may be derived with swizzles.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="NOT.i32" title="Bitwise complement" dests="1" opcode="0x91" opcode2="0xE">
+    <desc>
+      For fully featured bitwise operation, see the shift opcodes.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="NOT.i64" title="Bitwise complement" dests="1" opcode="0x191" opcode2="0xE">
+    <desc>
+      For fully featured bitwise operation, see the shift opcodes.
+    </desc>
+    <src/>
+  </ins>
+
+  <ins name="WMASK" title="Warp mask" dests="1" opcode="0x95">
+    <desc>
+      Returns the mask of lanes ever active within the warp (subgroup), such
+      that the source is nonzero. The number of work-items in a subgroup is
+      given as the popcount of this value with a nonzero input.
+
+      An `all()` subgroup operation may be constructed as `WMASK` of the input
+      compared for equality with `WMASK` of a nonzero value.
+
+      An `any()` subgroup operation may be constructed as `WMASK` of the input
+      compared against zero.
+    </desc>
+    <src/>
+    <subgroup/>
+  </ins>
+
+  <group name="FREXP" title="Fraction/exponent extract" dests="1" opcode="0x99">
+    <ins name="FREXPM.f32" opcode2="0"/>
+    <ins name="FREXPM.v2f16" opcode2="1"/>
+    <ins name="FREXPE.f32" opcode2="2"/>
+    <ins name="FREXPE.v2f16" opcode2="3"/>
+    <desc>
+      Breaks up the floating-point input into its fractional (mantissa) and
+      exponent parts. By default, this is compatible with the `frexp()` function
+      in APIs. With the log modifier, the floating point format is adjusted to
+      be compatible with Valhall's argument reduction for logarithm computation.
+    </desc>
+    <mod name="log" start="25" size="1"/>
+    <src float="true" swizzle="true"/>
+  </group>
+
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+    <ins name="FRCP.f32" opcode2="0"/>
+    <ins name="FRCP.f16" opcode2="1"/>
+    <ins name="FRSQ.f32" opcode2="2"/>
+    <ins name="FRSQ.f16" opcode2="3"/>
+    <ins name="FLOGD.f32" opcode2="8"/>
+    <desc>
+      Performs a given special function. The floating-point reciprocal (`FRCP`)
+      and reciprocal square root (`FRSQ`) instructions may be freely used as-is.
+      The logarithm instruction (`FLOGD.f32`) requires an argument reduction. See the
+      transcendentals section for more information.
+    </desc>
+    <src float="true" swizzle="true"/>
+  </group>
+
+  <group name="SFU" title="Special function unit" dests="1" opcode="0x9C">
+    <ins name="FSIN_TABLE.u6" opcode2="4"/>
+    <ins name="FCOS_TABLE.u6" opcode2="5"/>
+    <desc>
+      Performs a given special function. The trigonometric tables (`FSIN_TABLE.u6` and `FCOS_TABLE.u6`) are crude,
+      requiring both an argument reduction and postprocessing.
+    </desc>
+    <src/>
+  </group>
+
+  <group name="FADD" title="Floating-point add" dests="1" opcode2="0">
+    <ins name="FADD.f32" opcode="0xA4"/>
+    <ins name="FADD.v2f16" opcode="0xA5"/>
+    <desc>$A + B$</desc>
+    <clamp/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+  </group>
+
+  <group name="FMIN" title="Floating-point minimum" dests="1" opcode2="2">
+    <ins name="FMIN.f32" opcode="0xA4"/>
+    <ins name="FMIN.v2f16" opcode="0xA5"/>
+    <desc>$\min \{ A, B \}$</desc>
+    <clamp/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+  </group>
+
+  <group name="FMAX" title="Floating-point maximum" dests="1" opcode2="3">
+    <ins name="FMAX.f32" opcode="0xA4"/>
+    <ins name="FMAX.v2f16" opcode="0xA5"/>
+    <desc>$\max \{ A, B \}$</desc>
+    <clamp/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+  </group>
+
+  <group name="V2F32_TO_V2F16" title="Vectorized floating-point conversion" dests="1" opcode2="4">
+    <ins name="V2F32_TO_V2F16" opcode="0xA5"/>
+    <desc>
+      Given a pair of 32-bit floats, output a pair of 16-bit floats packed into
+      a 32-bit destination.
+    </desc>
+    <src>A</src>
+    <src>B</src>
+  </group>
+
+  <group name="FRSCALE" title="Floating-point rescaling" dests="1" opcode2="6">
+    <ins name="FRSCALE.f32" opcode="0xA4"/>
+    <ins name="FRSCALE.v2f16" opcode="0xA5"/>
+    <desc>
+      Computes $A \cdot 2^B$ by adding B to the exponent of A. Used to calculate
+      various special functions, particularly base-2 exponents. Special case
+      handling differs from an actual floating-point multiply, so this should
+      not be used outside fixed instruction sequences.
+    </desc>
+    <clamp/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+  </group>
+
+  <ins name="FEXP.f32" title="Floating-point exponent" dests="1" opcode="0xA4" opcode2="8">
+    <desc>
+      Calculates the base-2 exponent of an argument specified as an 8:24
+      fixed-point. The original argument is passed as well for correct handling
+      of special cases.
+    </desc>
+    <clamp/>
+    <src>Input as 8:24 fixed-point</src>
+    <src absneg="true">Input as 32-bit float</src>
+  </ins>
+
+  <ins name="FADD_LSCALE.f32" title="Floating-point add with logarithm scale" dests="1" opcode="0xA4" opcode2="9">
+    <desc>
+      Performs a floating-point addition specialized for logarithm computation.
+    </desc>
+    <clamp/>
+    <src absneg="true">A</src>
+    <src absneg="true">B</src>
+  </ins>
+
+  <group name="IADD" title="Integer addition" dests="1" opcode2="0">
+    <desc>
+      $A + B$ with optional saturation.
+
+      As Valhall lacks swizzle instructions, `IADD.v2i16` with zero is the
+      canonical lowering for swizzles.
+    </desc>
+    <ins name="IADD.u32" opcode="0xA0"/>
+    <ins name="IADD.v2u16" opcode="0xA1"/>
+    <ins name="IADD.v4u8" opcode="0xA2"/>
+    <ins name="IADD.s32" opcode="0xA8"/>
+    <ins name="IADD.v2s16" opcode="0xA9"/>
+    <ins name="IADD.v4s8" opcode="0x1A2"/>
+    <ins name="IADD.u64" opcode="0x1A3"/>
+    <ins name="IADD.s64" opcode="0x1AB"/>
+    <!-- <ins name="IADD.s32" opcode="0x1A0"/> -->
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <saturate/>
+  </group>
+
+  <ins name="MKVEC.v2i16" title="Make 16-bit vector" dests="1" opcode="0xA1" opcode2="0x5">
+    <desc>Calculates $A | (B \ll 16)$. Used to implement `(ushort2)(A, B)`</desc>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+  </ins>
+
+  <group name="ISUB" title="Integer subtract" dests="1" opcode2="1">
+    <ins name="ISUB.u32" opcode="0xA0"/>
+    <ins name="ISUB.v2u16" opcode="0xA1"/>
+    <ins name="ISUB.v4u8" opcode="0xA2"/>
+    <ins name="ISUB.s32" opcode="0xA8"/>
+    <ins name="ISUB.v2s16" opcode="0xA9"/>
+    <ins name="ISUB.v4s8" opcode="0x1A2"/>
+    <ins name="ISUB.u64" opcode="0x1A3"/>
+    <ins name="ISUB.s64" opcode="0x1AB"/>
+    <desc>$A - B$ with optional saturation</desc>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <saturate/>
+  </group>
+
+  <group name="SHADDX" title="Shift, extend, and 64-bit add" dests="1" opcode2="7">
+    <desc>
+      Sign or zero extend B to 64-bits, left-shift by `shift`, and add the
+      64-bit value A. These instructions accelerate address arithmetic, but may
+      be used in full generality for 64-bit integer arithmetic.
+    </desc>
+    <ins name="SHADDX.u64" opcode="0x1A3"/>
+    <ins name="SHADDX.s64" opcode="0x1AB"/>
+    <imm name="shift" start="20" size="3"/>
+    <src>A</src>
+    <src widen="true">B</src>
+  </group>
+
+  <group name="IMUL" title="Integer multiply" dests="1" opcode2="0x0A">
+    <ins name="IMUL.i32" opcode="0xA0"/>
+    <ins name="IMUL.v2i16" opcode="0xA1"/>
+    <ins name="IMUL.v4i8" opcode="0xA2"/>
+    <ins name="IMUL.s32" opcode="0xA8"/>
+    <ins name="IMUL.v2s16" opcode="0xA9"/>
+    <ins name="IMUL.v4s8" opcode="0x1A2"/>
+    <ins name="IMULD.u64" opcode="0x1A3"/>
+    <!-- <ins name="IMUL.s32" opcode="0x1A0"/> -->
+    <desc>
+      $A \cdot B$ with optional saturation. Note the multipliers can only handle up to
+      32-bit by 32-bit multiplies. The 64-bit "multiply" acts like IMUL.u32 but
+      additionally writes the high half of the product to the high half of the
+      64-bit destination. Along with IADD.u32 and IADD.u64, this allows the
+      construction of a 64-bit multiply in 5 instructions (6 clocks).
+    </desc>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <saturate/>
+  </group>
+
+  <group name="HADD" title="Integer half-add" dests="1" opcode2="0x0B">
+    <ins name="HADD.u32" opcode="0xA0"/>
+    <ins name="HADD.v2u16" opcode="0xA1"/>
+    <ins name="HADD.v4u8" opcode="0xA2"/>
+    <ins name="HADD.s32" opcode="0xA8"/>
+    <ins name="HADD.v2s16" opcode="0xA9"/>
+    <ins name="HADD.v4s8" opcode="0x1A2"/>
+    <mod name="rhadd" start="30" size="1"/>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <desc>
+      $(A + B) \gg 1$ without intermediate overflow, corresponding to `hadd()` in
+      OpenCL. With the `.rhadd` modifier set, it instead calculates
+      $(A + B + 1) \gg 1$ corresponding to `rhadd()` in OpenCL.
+    </desc>
+  </group>
+
+  <group name="CLPER" title="Cross-lane permute" dests="1" opcode2="0xF">
+    <ins name="CLPER.i32" opcode="0xA0"/>
+    <ins name="CLPER.v2u16" opcode="0xA1"/>
+    <ins name="CLPER.v4u8" opcode="0xA2"/>
+    <ins name="CLPER.s32" opcode="0xA8"/>
+    <ins name="CLPER.v2s16" opcode="0xA9"/>
+    <ins name="CLPER.v4s8" opcode="0x1A2"/>
+    <ins name="CLPER.u64" opcode="0x1A3"/>
+    <ins name="CLPER.s64" opcode="0x1AB"/>
+    <!-- <ins name="CLPER.s32" opcode="0x1A0"/> -->
+    <desc>
+      Selects the value of A in the subgroup lane given by B. This implements
+      subgroup broadcasts. It may be used as a primitive for screen space
+      derivatives in fragment shaders.
+    </desc>
+    <src>A</src>
+    <src widen="true">B</src>
+    <subgroup/>
+    <lane_op/>
+    <inactive_result/>
+  </group>
+
+  <group name="FMA" title="Fused floating-point multiply add" dests="1">
+    <ins name="FMA.f32" opcode="0xB2"/>
+    <ins name="FMA.v2f16" opcode="0xB3"/>
+    <desc>$A \cdot B + C$</desc>
+    <clamp/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+    <src absneg="true" swizzle="true">C</src>
+  </group>
+
+  <group name="LSHIFT_AND" title="Left shift and bitwise AND" dests="1" opcode2="0x100">
+    <ins name="LSHIFT_AND.i32" opcode="0xB4"/>
+    <ins name="LSHIFT_AND.v2i16" opcode="0xB5"/>
+    <ins name="LSHIFT_AND.v4i8" opcode="0xB6"/>
+    <ins name="LSHIFT_AND.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Left shifts its first source by a specified amount and bitwise ANDs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <group name="RSHIFT_AND" title="Right shift and bitwise AND" dests="1" opcode2="0x000">
+    <ins name="RSHIFT_AND.i32" opcode="0xB4"/>
+    <ins name="RSHIFT_AND.v2i16" opcode="0xB5"/>
+    <ins name="RSHIFT_AND.v4i8" opcode="0xB6"/>
+    <ins name="RSHIFT_AND.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Right shifts its first source by a specified amount and bitwise ANDs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <group name="LSHIFT_OR" title="Left shift and bitwise OR" dests="1" opcode2="0x101">
+    <ins name="LSHIFT_OR.i32" opcode="0xB4"/>
+    <ins name="LSHIFT_OR.v2i16" opcode="0xB5"/>
+    <ins name="LSHIFT_OR.v4i8" opcode="0xB6"/>
+    <ins name="LSHIFT_OR.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Left shifts its first source by a specified amount and bitwise ORs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <group name="RSHIFT_OR" title="Right shift and bitwise OR" dests="1" opcode2="0x001">
+    <ins name="RSHIFT_OR.i32" opcode="0xB4"/>
+    <ins name="RSHIFT_OR.v2i16" opcode="0xB5"/>
+    <ins name="RSHIFT_OR.v4i8" opcode="0xB6"/>
+    <ins name="RSHIFT_OR.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Right shifts its first source by a specified amount and bitwise ORs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <group name="LSHIFT_XOR" title="Left shift and bitwise XOR" dests="1" opcode2="0x102">
+    <ins name="LSHIFT_XOR.i32" opcode="0xB4"/>
+    <ins name="LSHIFT_XOR.v2i16" opcode="0xB5"/>
+    <ins name="LSHIFT_XOR.v4i8" opcode="0xB6"/>
+    <ins name="LSHIFT_XOR.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Left shifts its first source by a specified amount and bitwise XORs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <group name="RSHIFT_XOR" title="Right shift and bitwise XOR" dests="1" opcode2="0x002">
+    <ins name="RSHIFT_XOR.i32" opcode="0xB4"/>
+    <ins name="RSHIFT_XOR.v2i16" opcode="0xB5"/>
+    <ins name="RSHIFT_XOR.v4i8" opcode="0xB6"/>
+    <ins name="RSHIFT_XOR.i64" opcode="0x1B7"/>
+    <mod name="left" start="128" size="1" implied="true"/>
+    <desc>
+      Right shifts its first source by a specified amount and bitwise XORs it with the
+      second source, optionally inverting the second source or the result.
+    </desc>
+    <not_result/>
+    <src widen="true">A</src>
+    <src lanes="true" size="8">shift</src>
+    <src not="true">B</src>
+  </group>
+
+  <ins name="MUX.i32" title="Mux" dests="1" opcode="0xB8">
+    <desc>
+      Mux between A and B based on the provided mask. Equivalent to
+      `bitselect()` in OpenCL. `(A &amp; mask) | (B &amp; ~mask)`
+    </desc>
+    <src>A</src>
+    <src>B</src>
+    <src>Mask</src>
+  </ins>
+
+  <ins name="CUBE_SSEL" title="Cube S-coordinate select" dests="1" opcode="0xBC" opcode2="0">
+    <desc>During a cube map transform, select the S coordinate given a selected face.</desc>
+    <src absneg="true">Z coordinate as 32-bit floating point</src>
+    <src absneg="true">X coordinate as 32-bit floating point</src>
+    <src>Cube face index</src>
+  </ins>
+
+  <ins name="CUBE_TSEL" title="Cube T-coordinate select" dests="1" opcode="0xBC" opcode2="1">
+    <desc>During a cube map transform, select the T coordinate given a selected face.</desc>
+    <src absneg="true">Y coordinate as 32-bit floating point</src>
+    <src absneg="true">Z coordinate as 32-bit floating point</src>
+    <src>Cube face index</src>
+  </ins>
+
+  <ins name="MKVEC.v4i8" title="Make 8-bit vector" dests="1" opcode="0xBD">
+    <desc>
+      Calculates $A | (B \ll 8) | (CD \ll 16)$ for 8-bit A and B and 16-bit CD.
+
+      To implement `(uchar4) (A, B, C, D)` in full generality, use the sequence
+      `MKVEC.v4i8 CD, C, D, #0; MKVEC.v4i8 out, A, B, CD`
+
+      `MKVEC.v4i8` also allows zero extending arbitrary 8-bit lanes. For
+      example, to extend `r0.b3` to `r1`, use `MKVEC.v4i8 r1, r0.b3, 0x0.b0, 0x0`.
+    </desc>
+    <src lane="true">A</src>
+    <src lane="true">B</src>
+    <src>CD</src>
+  </ins>
+
+  <ins name="CUBEFACE1" title="Cube map transform step 1" dests="1" opcode="0xC0">
+    <desc>Select the maximum absolute value of its arguments.</desc>
+    <src absneg="true">X coordinate as 32-bit floating point</src>
+    <src absneg="true">Y coordinate as 32-bit floating point</src>
+    <src absneg="true">Z coordinate as 32-bit floating point</src>
+  </ins>
+
+  <ins name="CUBEFACE2" title="Cube map transform step 2" dests="1" opcode="0xC1">
+    <desc>Select the cube face index corresponding to the arguments.</desc>
+    <src absneg="true">X coordinate as 32-bit floating point</src>
+    <src absneg="true">Y coordinate as 32-bit floating point</src>
+    <src absneg="true">Z coordinate as 32-bit floating point</src>
+  </ins>
+
+  <group name="IDP" title="8-bit dot product" dests="1" opcode="0xC2">
+    <desc>
+      8-bit integer dot product between 4 channel vectors, intended for machine
+      learning. Available in both unsigned and signed variants, controlling
+      sign-extension/zero-extension behaviour to the final 32-bit destination.
+      Saturation is available. Corresponds to the `cl_arm_integer_dot_product_*`
+      family of OpenCL extensions. Not for actual use, just for completeness.
+      Instead, use your platform's neural accelerator.
+
+      For $A, B \in \{ 0, \ldots, 255 \}^4$ and $\text{Accumulator} \in
+      \mathbb{Z}$, calculates $(A \cdot B) + \text{Accumulator}$ and optionally
+      saturates.
+    </desc>
+    <ins name="IDP.v4s8" opcode2="0"/>
+    <ins name="IDP.v4u8" opcode2="1"/>
+    <src>A</src>
+    <src>B</src>
+    <src>Accumulator</src>
+    <saturate/>
+  </group>
+
+  <group name="ICMP" title="Unsigned integer compare" dests="1">
+    <desc>
+      Evaluates the given condition, does a logical and/or with the condition in
+      the result source, and returns in the given result type (integer
+      one, integer minus one, or floating-point one). The third source is useful
+      for chaining together conditions without intermediate bitwise arithmetic;
+      when this is not desired, tie it to zero and use the OR combine mode (do
+      not set the `.and` modifier).
+
+      The sequence modifier `.seq` is used to construct 64-bit compares in 2
+      `ICMP.u32` instructions, in conjunction with the `u1` result type on the
+      low half, the `m1` result type on the high half, and the result of the low
+      half comparison passed as the third source. For comparisons other than
+      64-bit, do not set the `.seq` modifier and do not use the `u1` result
+      type.
+    </desc>
+    <ins name="ICMP.u32" opcode="0xF0"/>
+    <ins name="ICMP.v2u16" opcode="0xF1"/>
+    <ins name="ICMP.v4u8" opcode="0xF2"/>
+    <cmp/>
+    <result_type/>
+    <mod name="and" start="24" size="1"/>
+    <mod name="seq" start="25" size="1"/>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <src>C</src>
+  </group>
+
+  <group name="FCMP" title="Floating-point compare" dests="1">
+    <desc>
+      Evaluates the given condition, does a logical and/or with the condition in
+      the result source, and returns in the given result type (integer
+      one, integer minus one, or floating-point one). The third source is useful
+      for chaining together conditions without intermediate bitwise arithmetic;
+      when this is not desired, tie it to zero and use the OR combine mode (do
+      not set the `.and` modifier).
+    </desc>
+    <ins name="FCMP.f32" opcode="0xF4"/>
+    <ins name="FCMP.v2f16" opcode="0xF5"/>
+    <cmp/>
+    <result_type/>
+    <mod name="and" start="24" size="1"/>
+    <src absneg="true" swizzle="true">A</src>
+    <src absneg="true" swizzle="true">B</src>
+    <src>C</src>
+  </group>
+
+  <group name="ICMP" title="Signed integer compare" dests="1">
+    <desc>
+      Evaluates the given condition, does a logical and/or with the condition in
+      the result source, and returns in the given result type (integer
+      one, integer minus one, or floating-point one). The third source is useful
+      for chaining together conditions without intermediate bitwise arithmetic;
+      when this is not desired, tie it to zero and use the OR combine mode (do
+      not set the `.and` modifier).
+
+      The sequence modifier `.seq` is used to construct signed 64-bit compares
+      in 1 `ICMP.u32` and 1 `ICMP.s32` instruction, in conjunction with the `u1`
+      result type on the low half, the `m1` result type on the high half, and
+      the result of the low half comparison passed as the third source. For
+      comparisons other than 64-bit, do not set the `.seq` modifier and do not
+      use the `u1` result type.
+    </desc>
+    <ins name="ICMP.s32" opcode="0xF8"/>
+    <ins name="ICMP.v2s16" opcode="0xF9"/>
+    <ins name="ICMP.v4s8" opcode="0xFA"/>
+    <cmp/>
+    <result_type/>
+    <mod name="and" start="24" size="1"/>
+    <mod name="seq" start="25" size="1"/>
+    <src widen="true">A</src>
+    <src widen="true">B</src>
+    <src>C</src>
+  </group>
+
+  <ins name="IADD_IMM.i32" title="Integer addition with immediate" dests="1" opcode="0x110">
+    <desc>
+      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
+      If no modifiers are required, this is preferred to `IADD.i32` with a
+      constant accessed as a uniform. However, if the constant is available
+      inline, `IADD.i32` is preferred.
+
+      `IADD_IMM.i32` with the source tied to zero is the canonical immediate move.
+    </desc>
+    <src>A</src>
+    <imm name="constant" start="8" size="32"/>
+  </ins>
+
+  <ins name="IADD_IMM.v2i16" title="Integer addition with immediate" dests="1" opcode="0x111">
+    <desc>
+      Adds an arbitrary pair of 16-bit immediates embedded within the
+      instruction stream. If no modifiers are required, this is preferred to
+      `IADD.v2i16` with a constant accessed as a uniform. However, if the
+      constant is available inline, `IADD.v2i16` is preferred. Adding only a
+      single 16-bit constant requires replication of the constant.
+    </desc>
+    <src>A</src>
+    <imm name="constant" start="8" size="32"/>
+  </ins>
+
+  <ins name="IADD_IMM.v4i8" title="Integer addition with immediate" dests="1" opcode="0x112">
+    <desc>
+      Adds an arbitrary quad of 8-bit immediates embedded within the
+      instruction stream. If no modifiers are required, this is preferred to
+      `IADD.v4i8` with a constant accessed as a uniform. However, if the
+      constant is available inline, `IADD.v4i8` is preferred. Adding only a
+      single 8-bit constant requires replication of the constant.
+    </desc>
+    <src>A</src>
+    <imm name="constant" start="8" size="32"/>
+  </ins>
+
+  <ins name="FADD_IMM.f32" title="Floating-point addition with immediate" dests="1" opcode="0x114">
+    <desc>
+      Adds an arbitrary 32-bit immediate embedded within the instruction stream.
+      If no modifiers are required, this is preferred to `FADD.f32` with a
+      constant accessed as a uniform. However, if the constant is available
+      inline, `FADD.f32` is preferred.
+    </desc>
+    <src>A</src>
+    <imm name="constant" start="8" size="32"/>
+  </ins>
+
+  <ins name="FADD_IMM.v2f16" title="Floating-point addition with immediate" dests="1" opcode="0x115">
+    <desc>
+      Adds an arbitrary pair of 16-bit immediates embedded within the
+      instruction stream. If no modifiers are required, this is preferred to
+      `FADD.v2f16` with a constant accessed as a uniform. However, if the
+      constant is available inline, `FADD.v2f16` is preferred. Adding only a
+      single 16-bit constant requires replication of the constant.
+    </desc>
+    <src float="true">A</src>
+    <imm name="constant" start="8" size="32"/>
+  </ins>
+
+  <ins name="TODO.ATOM_C1" title="Atomic operations on memory with 1" opcode="0x69">
+    <!-- TODO -->
+    <mod name="i32" start="17" size="1"/>
+    <mod name="unk" start="23" size="1"/>
+    <sr write="true"/>
+    <src/>
+    <imm name="operation" start="24" size="6"/>
+    <sr_count/>
+    <slot/>
+  </ins>
+
+  <ins name="TODO.ATOM_C" title="Atomic operations on memory" opcode="0x120">
+    <!-- TODO -->
+    <mod name="i32" start="17" size="1"/>
+    <mod name="unk" start="23" size="1"/>
+    <sr read="true" write="true"/>
+    <src/>
+    <imm name="operation" start="24" size="6"/>
+    <sr_count/>
+    <slot/>
+  </ins>
+
+  <ins name="TEX_FETCH" title="Texel fetch" opcode="0x125">
+    <desc>Unfiltered texturing instruction.</desc>
+    <sr read="true"/>
+    <sr write="true" count="4"/>
+    <mod name="explicit_offset" start="11" size="1"/>
+    <mod name="dimension" start="28" size="2"/>
+    <mod name="skip" start="39" size="1"/>
+    <sr_count/>
+    <slot/>
+    <src>Image to read from</src>
+  </ins>
+
+  <ins name="TEX" title="Texture load" opcode="0x128">
+    <desc>Ordinary texturing instruction using a sampler.</desc>
+    <sr read="true"/>
+    <sr write="true" count="4"/>
+    <src>Image to read from</src>
+    <mod name="explicit_offset" start="11" size="1"/>
+    <mod name="shadow" start="12" size="1"/>
+    <mod name="lod_mode" start="13" size="3"/>
+    <mod name="dimension" start="28" size="2"/>
+    <mod name="skip" start="39" size="1"/>
+    <sr_count/>
+    <slot/>
+  </ins>
+
+  <ins name="TODO.VAR_TEX" title="Fused varying-texturing" opcode="0x130">
+    <desc>Only works for FP32 varyings.</desc>
+    <sr write="true" count="4"/>
+    <mod name="dimension" start="28" size="2"/>
+    <mod name="skip" start="39" size="1"/>
+    <slot/>
+    <src>Image to read from</src>
+  </ins>
+
+  <ins name="FMA_RSCALE.f32" title="Fused floating-point multiply add with exponent bias" dests="1" opcode="0x160">
+    <desc>
+      First calculates $A \cdot B + C$ and then biases the exponent by D. Used in
+      special transcendental function sequences. It should not be used for
+      general code as its special case handling differs from two back-to-back
+      `FMA.f32` operations. Equivalent to `FMA.f32` back-to-back with
+      `FRSCALE.f32`.
+    </desc>
+    <clamp/>
+    <src absneg="true">A</src>
+    <src absneg="true">B</src>
+    <src absneg="true">C</src>
+    <src>D</src>
+  </ins>
+
+</valhall>
-- 
2.7.4