<constant desc="Half-float $\pi$">0x42480000</constant>
</lut>
- <enum name="Action">
+ <enum name="Flow">
<desc>
- Every Valhall instruction can perform an action, like wait on dependency
- slots. A few special actions are available, specified in the instruction
- metadata from this enum. The `wait0126` action is required to wait on
+ Every Valhall instruction can wait on dependency
+ slots. A few special flows are available, specified in the instruction
+ metadata from this enum. The `wait0126` flow is required to wait on
dependency slot #6 and should be set on the instruction immediately
- preceding `ATEST`. The `barrier` action may be set on any instruction for
- subgroup barriers, and should particularly be set with the `BARRIER`
- instruction for global barriers. The `td` action only applies to fragment
- shaders and is used to terminate helper invocations, it should be set as
- early as possible after helper invocations are no longer needed as
- determined by data flow analysis. The `return` action is used to terminate
- the shader, although it may be overloaded by the `BLEND` instruction.
-
- The `reconverge` action is required on any instruction immediately
+ preceding `ATEST`. The `wait` flow should be set for barriers.
+ The `discard` flow only applies to fragment shaders and is used to
+ terminate helper invocations; it should be set as early as possible after
+ helper invocations are no longer needed as determined by data flow
+ analysis. The `end` flow is used to terminate the shader, although it
+ may be overloaded by the `BLEND` instruction.
+
+ The `reconverge` flow is required on any instruction immediately
preceding a possible change to the mask of active threads in a subgroup.
This includes all divergent branches, but it also includes the final
instruction at the end of any basic block where the immediate successor
(fallthrough) is the target of a divergent branch.
</desc>
- <value name="Wait on all dependency slots">wait0126</value>
- <value name="Subgroup barrier">barrier</value>
+ <value name="None" default="true">none</value>
+ <value name="Wait on slot 0">wait0</value>
+ <value name="Wait on slot 1">wait1</value>
+ <value name="Wait on slots 0, 1">wait01</value>
+ <value name="Wait on slot 2">wait2</value>
+ <value name="Wait on slots 0, 2">wait02</value>
+ <value name="Wait on slots 1, 2">wait12</value>
+ <value name="Wait on slots 0, 1, 2">wait012</value>
+ <value name="Wait on slots 0, 1, 2, 6">wait0126</value>
+ <value name="Wait on slots 0, 1, 2, 6, 7">wait</value>
<value name="Perform branch reconverge">reconverge</value>
<reserved/>
<reserved/>
- <value name="Terminate discarded threads">td</value>
+ <value name="Terminate discarded threads">discard</value>
<reserved/>
- <value name="Return from shader">return</value>
+ <value name="Return from shader">end</value>
</enum>
<enum name="FAU special page 0">
<ins name="BARRIER" title="Execution and memory barrier" opcode="0x45" unit="NONE">
<desc>
General-purpose barrier. Must use slot #7. Must be paired with a
- `.barrier` action on the instruction.
+ `.wait` flow on the instruction.
</desc>
<slot/>
</ins>
82 3c 27 20 00 c0 a3 01 SHADDX.u64 r0, u2, r60.w0, shift:0x2
40 00 00 18 82 80 60 08 LOAD.i32.unsigned.slot0.wait0 @r0, `r0, offset:0
80 7c 47 20 00 c0 a3 01 SHADDX.u64 r0, u0, `r60.w0, shift:0x4
-40 00 00 38 08 44 61 78 STORE.i128.slot0.return @r4:r5:r6:r7, `r0, offset:0
-00 00 00 00 00 c0 00 78 NOP.return
+40 00 00 38 08 44 61 78 STORE.i128.slot0.end @r4:r5:r6:r7, `r0, offset:0
+00 00 00 00 00 c0 00 78 NOP.end
40 c4 c0 9c 01 c1 f0 00 ICMP.u32.gt.m1 r1, `r0, 0x1000000.b3, 0x0
42 00 00 18 02 40 61 50 STORE.i32.slot0.reconverge @r0, `r2, offset:0
00 c9 8f 12 30 c0 a0 00 CLPER.i32.f1 r0, r0, 0x7060504.b0
40 00 0b 10 00 c3 90 00 F16_TO_F32 r3, `r0.h1
00 00 00 00 00 c0 00 40 NOP.wait0126
42 43 04 00 00 c0 a5 00 V2F32_TO_V2F16 r0, `r2, `r3
-40 c0 00 28 90 c0 a5 48 FADD.v2f16.barrier r0, `r0.abs, 0x0.neg
+40 c0 00 28 90 c0 a5 48 FADD.v2f16.wait r0, `r0.abs, 0x0.neg
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
-3c d0 ea 00 02 bc 7d 68 ATEST.td @r60, r60, 0x3F800000, atest_datum.w0
+3c d0 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, 0x3F800000, atest_datum.w0
40 db 05 04 00 c1 a1 00 MKVEC.v2i16 r1, `r0.h00, 0x3C000000.h10
-f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.return @r0:r1, blend_descriptor_0.w0, r60, target:0x0
+f0 00 3c 33 04 40 7f 78 BLEND.slot0.v4.f16.end @r0:r1, blend_descriptor_0.w0, r60, target:0x0
7b 0d 00 40 04 84 5e 08 LEA_BUF_IMM.slot1.wait0 @r4:r5, `r59, table:0xD, index:0x0
00 dd c0 08 14 c2 b2 00 FMA.f32 r2, r0, 0x44000000.neg.h1, 0x0.neg
41 88 c0 00 04 c1 b2 00 FMA.f32 r1, `r1, u8, 0x0.neg
40 88 c0 00 04 c0 b2 10 FMA.f32.wait1 r0, `r0, u8, 0x0.neg
-44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.return @r0:r1:r2, `r4, offset:0
-44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.return @r8:r9:r10:r11, `r4, offset:0
-00 00 00 c0 01 c0 45 48 BARRIER.slot7.barrier
+44 00 00 32 06 40 61 78 STORE.i96.estream.slot0.end @r0:r1:r2, `r4, offset:0
+44 00 00 39 08 48 61 78 STORE.i128.istream.slot0.end @r8:r9:r10:r11, `r4, offset:0
+00 00 00 c0 01 c0 45 48 BARRIER.slot7.wait
80 00 00 00 82 82 60 00 LOAD.i8.unsigned.slot0 @r2, u0, offset:0
80 00 00 08 82 82 60 00 LOAD.i16.unsigned.slot0 @r2, u0, offset:0
80 00 00 10 82 82 60 00 LOAD.i24.unsigned.slot0 @r2, u0, offset:0
42 00 00 38 08 44 61 00 STORE.i128.slot0 @r4:r5:r6:r7, `r2, offset:0
41 f8 ff ff 07 c0 1f 50 BRANCHZ.reconverge `r1, offset:-8
7d c0 00 08 10 bc a1 00 IADD.v2u16 r60.h1, `r61.h10, 0x0
-44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
-44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
-44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.return @r0:r1:r2:r3, `r4, `r6, offset:0x0
+44 00 46 32 28 40 71 78 ST_CVT.slot0.istream.v4.f32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
+44 00 46 34 28 40 71 78 ST_CVT.slot0.istream.v4.s32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
+44 00 46 36 28 40 71 78 ST_CVT.slot0.istream.v4.u32.end @r0:r1:r2:r3, `r4, `r6, offset:0x0
7c c0 12 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x1
7c c0 02 00 26 84 67 00 LEA_TEX_IMM.slot0 @r4:r5:r6, `r60, 0x0, table:0x2, index:0x0
82 81 00 28 f4 82 6a 00 LD_BUFFER.i64.unsigned.slot0 @r2:r3, u2, u1
40 44 80 00 01 c0 b8 00 MUX.i32 r0, `r0, `r4, u0
40 44 80 00 02 c0 b8 00 MUX.i32.fp_zero r0, `r0, `r4, u0
40 44 80 00 03 c0 b8 00 MUX.i32.bit r0, `r0, `r4, u0
-00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.td r1, r0
+00 00 00 01 00 c1 99 68 FREXPM.f32.sqrt.discard r1, r0
01 00 02 00 00 c2 9c 00 FRSQ.f32 r2, r1
40 00 02 01 00 c0 99 00 FREXPE.f32.sqrt r0, `r0
41 42 c0 40 04 c0 62 41 FMA_RSCALE_LEFT.f32.wait0126 r0, `r1, `r2, 0x0.neg, `r0
00 00 03 00 20 c1 90 00 V2S8_TO_V2F16 r1, r0.b20
40 00 03 00 60 c0 90 00 V2S8_TO_V2F16 r0, `r0.b21
-3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.td @r0:r1:r2:r3, r61, index:0x0
+3d 00 00 b2 88 80 5c 68 LD_VAR_BUF_IMM.f32.slot2.v4.src_f32.sample.store.discard @r0:r1:r2:r3, r61, index:0x0
3d 00 10 72 18 84 5c 00 LD_VAR_BUF_IMM.f32.slot1.v4.src_f32.center.retrieve @r4:r5:r6:r7, r61, index:0x10
c0 00 00 00 00 c8 10 01 IADD_IMM.i32 r8, 0x0, #0x0
c0 00 00 00 00 c9 10 01 IADD_IMM.i32 r9, 0x0, #0x0
3d 00 14 00 00 ca 90 00 U16_TO_U32 r10, r61.h00
3d 09 00 00 30 c0 1f 50 BRANCHZ.eq.reconverge r61.h0, offset:9
0a 00 00 00 00 cb 91 50 MOV.i32.reconverge r11, r10
-00 00 00 00 00 c0 00 48 NOP.barrier
+00 00 00 00 00 c0 00 48 NOP.wait
81 0b 80 33 04 8e 78 00 LD_TILE.v4.f16.slot0 @r14:r15, u1, r11, u0
0b 00 04 00 00 cc 91 00 CLZ.u32 r12, r11
82 4c c0 52 00 cc b4 00 RSHIFT_XOR.i32.not_result r12, u2, `r12.b00, 0x0
49 3e c0 22 04 c9 b3 30 FMA.v2f16.wait12 r9, `r9, r62.h00, 0x0.neg
47 43 00 00 00 c3 a4 00 FADD.f32 r3, `r7, `r3
43 09 00 08 00 c3 a4 40 FADD.f32.wait0126 r3, `r3, r9.h1
-3c 03 ea 00 02 bc 7d 68 ATEST.td @r60, r60, r3, atest_datum.w0
+3c 03 ea 00 02 bc 7d 68 ATEST.discard @r60, r60, r3, atest_datum.w0
46 42 00 00 00 c2 a4 00 FADD.f32 r2, `r6, `r2
44 40 00 00 00 c0 a4 00 FADD.f32 r0, `r4, `r0
48 7e c0 22 04 ff b3 00 FMA.v2f16 r63, `r8, `r62.h00, 0x0.neg
45 41 00 00 00 c1 a4 00 FADD.f32 r1, `r5, `r1
41 3f 00 08 00 c1 a4 00 FADD.f32 r1, `r1, r63.h1
40 7f 00 04 00 c0 a4 00 FADD.f32 r0, `r0, `r63.h0
-42 49 00 04 00 c2 a4 48 FADD.f32.barrier r2, `r2, `r9.h0
-f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.return @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
+42 49 00 04 00 c2 a4 48 FADD.f32.wait r2, `r2, `r9.h0
+f0 00 3c 32 08 40 7f 78 BLEND.slot0.v4.f32.end @r0:r1:r2:r3, blend_descriptor_0.w0, r60, target:0x0
c0 00 00 00 00 f6 10 01 IADD_IMM.i32 r54, 0x0, #0x0
c0 f1 00 00 10 c1 2f 08 BRANCHZI.eq.absolute.wait0 0x0, blend_descriptor_0.w1
80 00 c0 17 34 7c 25 01 TEX_FETCH.slot0.f.32.2d @r0:r1:r2:r3, @r60:r61, u0