; GCN-LABEL: {{^}}addMul2D:
; GFX1010: v_fmac_f16
; GFX1010: v_fmac_f16
; NOTE(review): the '-'/'+' prefixed lines below are diff residue. The '-'
; lines are the old auto-numbered form (%5, %7, ...); the '+' lines are the
; same function with named values and blocks (%arg*, %tmp*, bb*). Both spell
; the same row/column nested loop accumulating a <4 x half> multiply-add.
-define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly, float addrspace(4)* nocapture readonly, <2 x i32>, i32) local_unnamed_addr #0 {
- %5 = extractelement <2 x i32> %2, i64 1
- %6 = icmp sgt i32 %5, 0
- br i1 %6, label %7, label %38
+define hidden <4 x half> @addMul2D(<4 x i8>* nocapture readonly %arg, float addrspace(4)* nocapture readonly %arg1, <2 x i32> %arg2, i32 %arg3) local_unnamed_addr #0 {
+bb:
+ %tmp = extractelement <2 x i32> %arg2, i64 1
+ %tmp4 = icmp sgt i32 %tmp, 0
+ br i1 %tmp4, label %bb5, label %bb36
-7: ; preds = %4
- %8 = extractelement <2 x i32> %2, i64 0
- %9 = icmp sgt i32 %8, 0
- br label %10
+bb5: ; preds = %bb
+ %tmp6 = extractelement <2 x i32> %arg2, i64 0
+ %tmp7 = icmp sgt i32 %tmp6, 0
+ br label %bb8
-10: ; preds = %34, %7
- %11 = phi <4 x half> [ zeroinitializer, %7 ], [ %35, %34 ]
- %12 = phi i32 [ 0, %7 ], [ %36, %34 ]
- br i1 %9, label %13, label %34
+bb8: ; preds = %bb32, %bb5
+ %tmp9 = phi <4 x half> [ zeroinitializer, %bb5 ], [ %tmp33, %bb32 ]
+ %tmp10 = phi i32 [ 0, %bb5 ], [ %tmp34, %bb32 ]
+ br i1 %tmp7, label %bb11, label %bb32
-13: ; preds = %10
- %14 = mul nsw i32 %12, %3
- %15 = mul nsw i32 %12, %8
- br label %16
+bb11: ; preds = %bb8
+ %tmp12 = mul nsw i32 %tmp10, %arg3
+ %tmp13 = mul nsw i32 %tmp10, %tmp6
+ br label %bb14
-16: ; preds = %16, %13
- %17 = phi <4 x half> [ %11, %13 ], [ %31, %16 ]
- %18 = phi i32 [ 0, %13 ], [ %32, %16 ]
- %19 = add nsw i32 %18, %14
- %20 = sext i32 %19 to i64
- %21 = getelementptr inbounds <4 x i8>, <4 x i8>* %0, i64 %20
- %22 = load <4 x i8>, <4 x i8>* %21, align 4
- %23 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %22) #8
- %24 = add nsw i32 %18, %15
- %25 = sext i32 %24 to i64
- %26 = getelementptr inbounds float, float addrspace(4)* %1, i64 %25
- %27 = load float, float addrspace(4)* %26, align 4
- %28 = fptrunc float %27 to half
- %29 = insertelement <4 x half> undef, half %28, i32 0
- %30 = shufflevector <4 x half> %29, <4 x half> undef, <4 x i32> zeroinitializer
- %31 = tail call <4 x half> @llvm.fmuladd.v4f16(<4 x half> %23, <4 x half> %30, <4 x half> %17)
- %32 = add nuw nsw i32 %18, 1
- %33 = icmp eq i32 %32, %8
- br i1 %33, label %34, label %16
; Inner loop, '+' form: the old llvm.fmuladd.v4f16 call is scalarized into
; four llvm.fmuladd.f16 calls and the lanes reassembled with insertelement.
; NOTE(review): the '+' form also drops the '#8' call-site attribute the '-'
; form had on the convert call — confirm that is intentional.
+bb14: ; preds = %bb14, %bb11
+ %tmp15 = phi <4 x half> [ %tmp9, %bb11 ], [ %tmp29, %bb14 ]
+ %tmp16 = phi i32 [ 0, %bb11 ], [ %tmp30, %bb14 ]
+ %tmp17 = add nsw i32 %tmp16, %tmp12
+ %tmp18 = sext i32 %tmp17 to i64
+ %tmp19 = getelementptr inbounds <4 x i8>, <4 x i8>* %arg, i64 %tmp18
+ %tmp20 = load <4 x i8>, <4 x i8>* %tmp19, align 4
+ %tmp21 = tail call <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %tmp20)
+ %tmp22 = add nsw i32 %tmp16, %tmp13
+ %tmp23 = sext i32 %tmp22 to i64
+ %tmp24 = getelementptr inbounds float, float addrspace(4)* %arg1, i64 %tmp23
+ %tmp25 = load float, float addrspace(4)* %tmp24, align 4
+ %tmp26 = fptrunc float %tmp25 to half
+ %tmp27 = insertelement <4 x half> undef, half %tmp26, i32 0
+ %tmp28 = shufflevector <4 x half> %tmp27, <4 x half> undef, <4 x i32> zeroinitializer
+ %vec.A.0 = extractelement <4 x half> %tmp21, i32 0
+ %vec.B.0 = extractelement <4 x half> %tmp28, i32 0
+ %vec.C.0 = extractelement <4 x half> %tmp15, i32 0
+ %vec.res.0 = tail call half @llvm.fmuladd.f16(half %vec.A.0, half %vec.B.0, half %vec.C.0)
+ %vec.A.1 = extractelement <4 x half> %tmp21, i32 1
+ %vec.B.1 = extractelement <4 x half> %tmp28, i32 1
+ %vec.C.1 = extractelement <4 x half> %tmp15, i32 1
+ %vec.res.1 = tail call half @llvm.fmuladd.f16(half %vec.A.1, half %vec.B.1, half %vec.C.1)
+ %vec.A.2 = extractelement <4 x half> %tmp21, i32 2
+ %vec.B.2 = extractelement <4 x half> %tmp28, i32 2
+ %vec.C.2 = extractelement <4 x half> %tmp15, i32 2
+ %vec.res.2 = tail call half @llvm.fmuladd.f16(half %vec.A.2, half %vec.B.2, half %vec.C.2)
+ %vec.A.3 = extractelement <4 x half> %tmp21, i32 3
+ %vec.B.3 = extractelement <4 x half> %tmp28, i32 3
+ %vec.C.3 = extractelement <4 x half> %tmp15, i32 3
+ %vec.res.3 = tail call half @llvm.fmuladd.f16(half %vec.A.3, half %vec.B.3, half %vec.C.3)
+ %full.res.0 = insertelement <4 x half> undef, half %vec.res.0, i32 0
+ %full.res.1 = insertelement <4 x half> %full.res.0, half %vec.res.1, i32 1
+ %full.res.2 = insertelement <4 x half> %full.res.1, half %vec.res.2, i32 2
+ %tmp29 = insertelement <4 x half> %full.res.2, half %vec.res.3, i32 3
+ %tmp30 = add nuw nsw i32 %tmp16, 1
+ %tmp31 = icmp eq i32 %tmp30, %tmp6
+ br i1 %tmp31, label %bb32, label %bb14
-34: ; preds = %16, %10
- %35 = phi <4 x half> [ %11, %10 ], [ %31, %16 ]
- %36 = add nuw nsw i32 %12, 1
- %37 = icmp eq i32 %36, %5
- br i1 %37, label %38, label %10
+bb32: ; preds = %bb14, %bb8
+ %tmp33 = phi <4 x half> [ %tmp9, %bb8 ], [ %tmp29, %bb14 ]
+ %tmp34 = add nuw nsw i32 %tmp10, 1
+ %tmp35 = icmp eq i32 %tmp34, %tmp
+ br i1 %tmp35, label %bb36, label %bb8
-38: ; preds = %34, %4
- %39 = phi <4 x half> [ zeroinitializer, %4 ], [ %35, %34 ]
- ret <4 x half> %39
+bb36: ; preds = %bb32, %bb
+ %tmp37 = phi <4 x half> [ zeroinitializer, %bb ], [ %tmp33, %bb32 ]
+ ret <4 x half> %tmp37
}
; _Z13convert_half4Dv4_h: lane-by-lane uitofp of a <4 x i8> to <4 x half>
; (Itanium mangling of OpenCL convert_half4(uchar4)). '-' lines are the old
; numbered form, '+' lines the renamed form plus an attribute note.
-define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8>) local_unnamed_addr #1 {
- %2 = extractelement <4 x i8> %0, i64 0
- %3 = uitofp i8 %2 to half
- %4 = insertelement <4 x half> undef, half %3, i32 0
- %5 = extractelement <4 x i8> %0, i64 1
- %6 = uitofp i8 %5 to half
- %7 = insertelement <4 x half> %4, half %6, i32 1
- %8 = extractelement <4 x i8> %0, i64 2
- %9 = uitofp i8 %8 to half
- %10 = insertelement <4 x half> %7, half %9, i32 2
- %11 = extractelement <4 x i8> %0, i64 3
- %12 = uitofp i8 %11 to half
- %13 = insertelement <4 x half> %10, half %12, i32 3
- ret <4 x half> %13
+; Function Attrs: norecurse nounwind readnone
+define linkonce_odr hidden <4 x half> @_Z13convert_half4Dv4_h(<4 x i8> %arg) local_unnamed_addr #1 {
+bb:
+ %tmp = extractelement <4 x i8> %arg, i64 0
+ %tmp1 = uitofp i8 %tmp to half
+ %tmp2 = insertelement <4 x half> undef, half %tmp1, i32 0
+ %tmp3 = extractelement <4 x i8> %arg, i64 1
+ %tmp4 = uitofp i8 %tmp3 to half
+ %tmp5 = insertelement <4 x half> %tmp2, half %tmp4, i32 1
+ %tmp6 = extractelement <4 x i8> %arg, i64 2
+ %tmp7 = uitofp i8 %tmp6 to half
+ %tmp8 = insertelement <4 x half> %tmp5, half %tmp7, i32 2
+ %tmp9 = extractelement <4 x i8> %arg, i64 3
+ %tmp10 = uitofp i8 %tmp9 to half
+ %tmp11 = insertelement <4 x half> %tmp8, half %tmp10, i32 3
+ ret <4 x half> %tmp11
}
; Intrinsic declaration updated to match the scalarized loop above: the
; vector fmuladd form is no longer referenced, only the scalar f16 form.
-declare <4 x half> @llvm.fmuladd.v4f16(<4 x half>, <4 x half>, <4 x half>)
+declare half @llvm.fmuladd.f16(half, half, half)
attributes #0 = { convergent nounwind readonly}
attributes #1 = { norecurse nounwind readnone }
; The lit invocations gain a gfx900 target and switch to shared check
; prefixes (SIVI, VIGFX9) so a single directive can cover several targets;
; a v4f16 fma declare is added for the new fma_v4f16 kernel below.
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SIVI %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VIGFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VIGFX9 %s
declare half @llvm.fma.f16(half %a, half %b, half %c)
declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+declare <4 x half> @llvm.fma.v4f16(<4 x half> %a, <4 x half> %b, <4 x half> %c)
; GCN-LABEL: {{^}}fma_f16
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16(
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[B_F32:[0-9]]], s[[A_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], s[[A_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_a(
; SI: v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], s[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], s[[B_F16]], v[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_b(
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
; SI: v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], s[[C_F32:[0-9]]]
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], s[[C_F16]]
; GCN: buffer_store_short v[[R_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_f16_imm_c(
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16(
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[A_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VIGFX9: s_movk_i32 s[[A_F16:[0-9]+]], 0x4200{{$}}
+; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SIVI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], s[[A_F16]], v[[B_F16_1]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], s[[A_F16]], v[[B_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_a(
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[B_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[B_F16:[0-9]+]], 0x4200{{$}}
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], s[[B_F16]], v[[C_F16_1]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], s[[B_F16]], v[[C_V2_F16]]
+
+; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_b(
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; VIGFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: s_mov_b32 s[[C_F32:[0-9]+]], 0x40400000{{$}}
-; VI: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
+; VIGFX9: s_movk_i32 s[[C_F16:[0-9]+]], 0x4200{{$}}
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; GCN-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+; GFX9: v_pk_fma_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], s[[C_F16]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
store <2 x half> %r.val, <2 x half> addrspace(1)* %r
ret void
}
+
+; GCN-LABEL: {{^}}fma_v4f16
+; GCN: buffer_load_dwordx2 v{{\[}}[[A_V4_F16_LO:[0-9]+]]:[[A_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[B_V4_F16_LO:[0-9]+]]:[[B_V4_F16_HI:[0-9]+]]{{\]}}
+; GCN: buffer_load_dwordx2 v{{\[}}[[C_V4_F16_LO:[0-9]+]]:[[C_V4_F16_HI:[0-9]+]]{{\]}}
+
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_2:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[A_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[A_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_2:[0-9]+]], v[[B_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_3:[0-9]+]], v[[B_V4_F16_HI]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_2:[0-9]+]], v[[C_V4_F16_LO]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_3:[0-9]+]], v[[C_V4_F16_HI]]
+
+; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG: v_fma_f32 v[[R_F32_2:[0-9]+]], v[[A_F32_2]], v[[B_F32_2]], v[[C_F32_2]]
+; SI-DAG: v_fma_f32 v[[R_F32_3:[0-9]+]], v[[A_F32_3]], v[[B_F32_3]], v[[C_F32_3]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_2:[0-9]+]], v[[R_F32_2]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[R_F32_3]]
+
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_0:[0-9]]], 16, v[[R_F16_2]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R1_F16_1:[0-9]]], 16, v[[R_F16_3]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_0:[0-9]+]], 16, v[[A_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_0:[0-9]+]], 16, v[[B_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V4_F16_HI]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_0:[0-9]+]], 16, v[[C_V4_F16_LO]]
+; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V4_F16_HI]]
+
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; VI-DAG: v_fma_f16 v[[R1_F16_0:[0-9]+]], v[[A_F16_0]], v[[B_F16_0]], v[[C_F16_0]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+; VI-DAG: v_fma_f16 v[[R1_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_LO:[0-9]+]], v[[R_F16_0]], v[[R1_F16_0]]
+; SIVI-DAG: v_or_b32_e32 v[[R_V4_F16_HI:[0-9]+]], v[[R_F16_1]], v[[R1_F16_1]]
+
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_LO:[0-9]+]], v[[A_V4_F16_LO]], v[[B_V4_F16_LO]], v[[C_V4_F16_LO]]
+; GFX9-DAG: v_pk_fma_f16 v[[R_V4_F16_HI:[0-9]+]], v[[A_V4_F16_HI]], v[[B_V4_F16_HI]], v[[C_V4_F16_HI]]
+
+; GCN: buffer_store_dwordx2 v{{\[}}[[R_V4_F16_LO]]:[[R_V4_F16_HI]]{{\]}}
+; GCN: s_endpgm
+
+; Kernel: loads three <4 x half> operands from global memory, applies the
+; element-wise fma intrinsic (r = a * b + c per lane), and stores the result.
+define amdgpu_kernel void @fma_v4f16(
+  <4 x half> addrspace(1)* %r,
+  <4 x half> addrspace(1)* %a,
+  <4 x half> addrspace(1)* %b,
+  <4 x half> addrspace(1)* %c) {
+  ; Fetch the three packed-half operand vectors.
+  %va = load <4 x half>, <4 x half> addrspace(1)* %a
+  %vb = load <4 x half>, <4 x half> addrspace(1)* %b
+  %vc = load <4 x half>, <4 x half> addrspace(1)* %c
+  ; Fused multiply-add across all four lanes at once.
+  %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %va, <4 x half> %vb, <4 x half> %vc)
+  store <4 x half> %fma, <4 x half> addrspace(1)* %r
+  ret void
+}