; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
-; GCN-COUNT32: v_accvgpr_write_b32
+
+; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
+; 3 vgprs are needed to avoid wait states between writes.
+
+; FIXME: We should not be using and temporary registers at all.
+; At the moment we initialize an sgpr, then copy it via vgprs.
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2:v[0-9]+]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3:v[0-9]+]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1:v[0-9]+]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
+; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
+
+; Check that we do not copy agprs to vgprs and back inside the loop.
+
; GCN: [[LOOP:BB[0-9_]+]]:
; GCN-NOT: v_accvgpr
; GCN: v_mfma_f32_32x32x1f32
; GCN-NOT: v_accvgpr
; GCN: s_cbranch_scc1 [[LOOP]]
+
+; Final result should be read only once after the loop.
+
; GCN-COUNT32: v_accvgpr_read_b32
+
define amdgpu_kernel void @test_mfma_loop_zeroinit(<32 x float> addrspace(1)* %arg) {
entry:
br label %for.cond.preheader