[NFC][AMDGPU] Pre-commit test for D132837.
author Thomas Symalla <thomas.symalla@amd.com>
Fri, 9 Sep 2022 12:08:10 +0000 (14:08 +0200)
committer Thomas Symalla <thomas.symalla@amd.com>
Fri, 9 Sep 2022 12:09:02 +0000 (14:09 +0200)
llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll

index 0c79809..7c45f4a 100644 (file)
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
 ; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
 
@@ -8,27 +9,68 @@
 ; exists in the original program.
 
 ; (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
-
-; GCN-LABEL: {{^}}fast_add_fmuladd_fmul:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-FLUSH: v_mac_f32_e32 [[Z]], [[U]], [[V]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[Z]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
-
-; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], [[Z]]
-; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
-; GCN-FASTFMA: buffer_store_dword [[FMA1]]
-
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
 define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
+; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, v2
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -41,21 +83,68 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-FLUSH: v_mad_f32 [[TMP:v[0-9]]], [[U]], [[V]], -[[Z]]
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[TMP]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[Z]]
-
-; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
-; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
-; GCN-FASTFMA: buffer_store_dword [[FMA1]]
 define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mad_f32 v2, v3, v4, -v2
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v2, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_fma_f32 v2, v3, v4, -v2
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -68,26 +157,76 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[U]], [[Z]]
-
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[FMA1]], [[Z]]
-
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v3, v2
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -101,26 +240,76 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_mul_commute:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-FLUSH-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FLUSH-DAG: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[U]]
-
-; GCN-FASTFMA: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[FMA1]]
-
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v2, v3
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_mul_commute:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -134,18 +323,76 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v3, v2
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -159,18 +406,76 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_mul_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_add_f32_e32
 define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+; GCN-FLUSH-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_add_f32_e32 v0, v2, v3
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-FASTFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_add_fmuladd_fmul_multi_use_fmuladd_commute:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v2, v0
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -184,28 +489,76 @@ define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_mul:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-
-; GCN-FLUSH: v_mad_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FLUSH: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
-
-; GCN-FASTFMA: v_fma_f32 [[MAD:v[0-9]+]], [[X]], [[Y]], [[MUL]]
-; GCN-FASTFMA: v_sub_f32_e32 [[SUB:v[0-9]+]], [[MAD]], [[Z]]
-
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_sub_f32_e32 [[SUB:v[0-9]+]]
-
-; GCN: buffer_store_dword [[MUL]]
-; GCN: buffer_store_dword [[SUB]]
 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mad_f32 v0, v0, v1, v3
+; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_mul:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v0, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -219,33 +572,76 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]],  [[MUL]], [[Z]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
-; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
-; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_sub_f32_e32
 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v3, v2
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_sub_f32_e32 v1, v0, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v1, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -259,33 +655,76 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_lhs() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_dword [[U:v[0-9]+]]
-; GCN: buffer_load_dword [[V:v[0-9]+]]
-
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[U]], [[V]]
-
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]],  [[Z]], [[MUL]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[U]]
-; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
-; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_sub_f32_e32
 define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v2, v3
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v3
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -299,35 +738,82 @@ define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd_rhs() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_ushort [[U:v[0-9]+]]
-; GCN: buffer_load_ushort [[V:v[0-9]+]]
-
-; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[VFLOAT:v[0-9]+]], [[V]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]]
-
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]],  [[MUL]], [[Z]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
-; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[FMA]], [[Z]]
-; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_sub_f32_e32
 define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v3, v2
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-FASTFMA-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_sub_f32_e32 v1, v0, v2
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-SLOWFMA-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v1, v3, v4
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v1, v0, v2
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -342,35 +828,82 @@ define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_lhs() #
   ret void
 }
 
-; GCN-LABEL: {{^}}fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
-; GCN: buffer_load_dword [[X:v[0-9]+]]
-; GCN: buffer_load_dword [[Y:v[0-9]+]]
-; GCN: buffer_load_dword [[Z:v[0-9]+]]
-; GCN: buffer_load_ushort [[U:v[0-9]+]]
-; GCN: buffer_load_ushort [[V:v[0-9]+]]
-
-; GCN-DAG: v_cvt_f32_f16_e32 [[UFLOAT:v[0-9]+]], [[U]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[VFLOAT:v[0-9]+]], [[V]]
-; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[UFLOAT]], [[VFLOAT]]
-
-; GCN-FLUSH-NEXT: v_mac_f32_e32 [[MUL]], [[X]], [[Y]]
-; GCN-FLUSH-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]],  [[Z]], [[MUL]]
-; GCN-FLUSH-NEXT: buffer_store_dword [[MUL]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-; GCN-FLUSH-NEXT: buffer_store_dword [[SUB]]
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-FASTFMA-NEXT: v_fma_f32 [[FMA:v[0-9]+]], [[X]], [[Y]], [[UFLOAT]]
-; GCN-FASTFMA-NEXT: v_sub_f32_e32 [[SUB:v[0-9]+]], [[Z]], [[FMA]]
-; GCN-FASTFMA-NEXT: buffer_store_dword [[FMA]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-; GCN-FASTFMA-NEXT: buffer_store_dword [[SUB]]
-; GCN-FASTFMA-NEXT: s_waitcnt vmcnt(0)
-
-; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[X]], [[Y]]
-; GCN-SLOWFMA: v_add_f32_e32
-; GCN-SLOWFMA: v_sub_f32_e32
 define amdgpu_kernel void @fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs() #0 {
+; GCN-FLUSH-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
+; GCN-FLUSH:       ; %bb.0:
+; GCN-FLUSH-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FLUSH-NEXT:    s_mov_b32 s2, -1
+; GCN-FLUSH-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FLUSH-NEXT:    v_mac_f32_e32 v3, v0, v1
+; GCN-FLUSH-NEXT:    v_sub_f32_e32 v0, v2, v3
+; GCN-FLUSH-NEXT:    buffer_store_dword v3, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FLUSH-NEXT:    s_endpgm
+;
+; GCN-FASTFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
+; GCN-FASTFMA:       ; %bb.0:
+; GCN-FASTFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-FASTFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-FASTFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-FASTFMA-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-FASTFMA-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GCN-FASTFMA-NEXT:    v_fma_f32 v0, v0, v1, v3
+; GCN-FASTFMA-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GCN-FASTFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-FASTFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-FASTFMA-NEXT:    s_endpgm
+;
+; GCN-SLOWFMA-LABEL: fast_sub_fmuladd_fpext_fmul_multi_use_fmuladd_rhs:
+; GCN-SLOWFMA:       ; %bb.0:
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-SLOWFMA-NEXT:    s_mov_b32 s2, -1
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_dword v2, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_ushort v3, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_load_ushort v4, off, s[0:3], 0 glc
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GCN-SLOWFMA-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_mul_f32_e32 v1, v3, v4
+; GCN-SLOWFMA-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-SLOWFMA-NEXT:    v_sub_f32_e32 v1, v2, v0
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GCN-SLOWFMA-NEXT:    s_waitcnt vmcnt(0)
+; GCN-SLOWFMA-NEXT:    s_endpgm
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -390,3 +923,5 @@ declare float @llvm.fmuladd.f32(float, float, float) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}