From: Piotr Sobczak
Date: Fri, 1 Jul 2022 06:18:03 +0000 (+0200)
Subject: [AMDGPU] Update WMMA intrinsics with explicit f16 types
X-Git-Tag: upstream/15.0.7~2919
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b6ef36a1c427d07116fea84623b7caa37d8a7d7b;p=platform%2Fupstream%2Fllvm.git

[AMDGPU] Update WMMA intrinsics with explicit f16 types

Update the WMMA intrinsics to take <n x f16> and <n x i16> vector
operands instead of packed 32-bit types. This avoids the need for a
bitcast in the caller and is less confusing.

Depends on making the v16f16 and v16i16 types legal.
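For example, for the wave32 f32.f16 variant, a caller whose A and B
operands live as <16 x half> previously had to widen them to <8 x float>
before the call; with the explicit types the call takes them directly
(illustrative IR; %A, %B, %C are placeholder values):

  ; old signature: f16 data had to be packed into 32-bit vectors first
  %a.cast = bitcast <16 x half> %A to <8 x float>
  %b.cast = bitcast <16 x half> %B to <8 x float>
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %a.cast, <8 x float> %b.cast, <8 x float> %C)

  ; new signature: the f16 element type is explicit, no bitcast needed
  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)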
Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D128951
---

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 2ca3bdc..c2dcfc2 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2023,10 +2023,10 @@ class AMDGPUWmmaIntrinsicIU :
   [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>]
 >;
 
-def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic;
-def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic;
-def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL;
-def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL;
+def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic;
+def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic;
+def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL;
 def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU;
 def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU;
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 9c8d3b4..29ee9f1 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -329,13 +329,14 @@ def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD",
 //===----------------------------------------------------------------------===//
 
 // Returns 1 if the source arguments have modifiers, 0 if they do not.
-// XXX - do f16 instructions?
 class isFloatType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, f16.Value),
                 !eq(SrcVT.Value, f32.Value),
                 !eq(SrcVT.Value, f64.Value),
                 !eq(SrcVT.Value, v2f16.Value),
                 !eq(SrcVT.Value, v4f16.Value),
+                !eq(SrcVT.Value, v8f16.Value),
+                !eq(SrcVT.Value, v16f16.Value),
                 !eq(SrcVT.Value, v2f32.Value),
                 !eq(SrcVT.Value, v4f32.Value),
                 !eq(SrcVT.Value, v8f32.Value),
@@ -343,10 +344,14 @@ class isFloatType<ValueType SrcVT> {
                 !eq(SrcVT.Value, v4f64.Value));
 }
 
+// XXX - do v2i16 instructions?
 class isIntType<ValueType SrcVT> {
   bit ret = !or(!eq(SrcVT.Value, i16.Value),
                 !eq(SrcVT.Value, i32.Value),
                 !eq(SrcVT.Value, i64.Value),
+                !eq(SrcVT.Value, v4i16.Value),
+                !eq(SrcVT.Value, v8i16.Value),
+                !eq(SrcVT.Value, v16i16.Value),
                 !eq(SrcVT.Value, v2i32.Value),
                 !eq(SrcVT.Value, v4i32.Value),
                 !eq(SrcVT.Value, v8i32.Value));
@@ -1733,10 +1738,16 @@ class isModifierType<ValueType SrcVT> {
                 !eq(SrcVT.Value, v2i16.Value),
                 !eq(SrcVT.Value, v2f32.Value),
                 !eq(SrcVT.Value, v2i32.Value),
+                !eq(SrcVT.Value, v4f16.Value),
+                !eq(SrcVT.Value, v4i16.Value),
                 !eq(SrcVT.Value, v4f32.Value),
                 !eq(SrcVT.Value, v4i32.Value),
+                !eq(SrcVT.Value, v8f16.Value),
+                !eq(SrcVT.Value, v8i16.Value),
                 !eq(SrcVT.Value, v8f32.Value),
-                !eq(SrcVT.Value, v8i32.Value));
+                !eq(SrcVT.Value, v8i32.Value),
+                !eq(SrcVT.Value, v16f16.Value),
+                !eq(SrcVT.Value, v16i16.Value));
 }
 
 // Return type of input modifiers operand for specified input operand
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 0651d2e..59ce532 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -684,20 +684,24 @@ class VOPProfileWMMA
 
-def VOP_V8F32_V8F32_V8F32_V8F32 : VOPProfile <[v8f32, v8f32, v8f32, v8f32]>;
-def VOP_V8F32_V8I32_V8I32_V8F32 : VOPProfile <[v8f32, v8i32, v8i32, v8f32]>;
+def VOP_V8F32_V16F16_V16F16_V8F32 : VOPProfile <[v8f32, v16f16, v16f16, v8f32]>;
+def VOP_V8F32_V16I16_V16I16_V8F32 : VOPProfile <[v8f32, v16i16, v16i16, v8f32]>;
+def VOP_V16F16_V16F16_V16F16_V16F16 : VOPProfile <[v16f16, v16f16, v16f16, v16f16]>;
+def VOP_V16I16_V16I16_V16I16_V16I16 : VOPProfile <[v16i16, v16i16, v16i16, v16i16]>;
 def VOP_V8I32_V4I32_V4I32_V8I32 : VOPProfile <[v8i32, v4i32, v4i32, v8i32]>;
 def VOP_V8I32_V2I32_V2I32_V8I32 : VOPProfile <[v8i32, v2i32, v2i32, v8i32]>;
-def VOP_V8I32_V8I32_V8I32_V8I32 : VOPProfile <[v8i32, v8i32, v8i32, v8i32]>;
-def VOP_V4F32_V8F32_V8F32_V4F32 : VOPProfile <[v4f32, v8f32, v8f32, v4f32]>;
-def VOP_V4F32_V8I32_V8I32_V4F32 : VOPProfile <[v4f32, v8i32, v8i32, v4f32]>;
+def VOP_V4F32_V16F16_V16F16_V4F32 : VOPProfile <[v4f32, v16f16, v16f16, v4f32]>;
+def VOP_V4F32_V16I16_V16I16_V4F32 : VOPProfile <[v4f32, v16i16, v16i16, v4f32]>;
+def VOP_V8F16_V16F16_V16F16_V8F16 : VOPProfile <[v8f16, v16f16, v16f16, v8f16]>;
+def VOP_V8I16_V16I16_V16I16_V8I16 : VOPProfile <[v8i16, v16i16, v16i16, v8i16]>;
 def VOP_V4I32_V4I32_V4I32_V4I32 : VOPProfile <[v4i32, v4i32, v4i32, v4i32]>;
 def VOP_V4I32_V2I32_V2I32_V4I32 : VOPProfile <[v4i32, v2i32, v2i32, v4i32]>;
-def VOP_V4I32_V8I32_V8I32_V4I32 : VOPProfile <[v4i32, v8i32, v8i32, v4i32]>;
+
 
 class WMMAType <bits<2> val> {
   bit hasClamp = val{0};
@@ -812,22 +816,24 @@ multiclass WMMAInst
 
 let WaveSizePredicate = isWave32 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V8I32_V8I32_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V8F32_V8F32_V8F32_V8F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V8I32_V8I32_V8I32_V8I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
   defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
   defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
 }
 
 let WaveSizePredicate = isWave64 in {
-  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V8I32_V8I32_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
-  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V4F32_V8F32_V8F32_V4F32, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
-  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V4I32_V8I32_V8I32_V4I32, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
+  defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
+  defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
   defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
   defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
index 6b8c6922..6538495 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_32.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -20,14 +20,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -37,14 +37,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -54,12 +54,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -69,14 +69,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -86,12 +86,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,8 +101,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
index 864e9bb..51b5d6c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wmma_64.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -18,14 +18,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -33,14 +33,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -48,12 +48,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -61,14 +61,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -76,12 +76,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -89,8 +89,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
index 2261456..c63a634 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_32.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -20,14 +20,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -37,14 +37,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -54,12 +54,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -69,14 +69,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
@@ -86,12 +86,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:23], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,8 +101,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
index a2240f2..6db0205 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma_64.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -18,14 +18,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -33,14 +33,14 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -48,12 +48,12 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -61,14 +61,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
@@ -76,12 +76,12 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[16:19], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -89,8 +89,8 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
index 44af8f4..4ef413d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_32.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W32
 
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x float>)
-declare <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float> , <8 x float>, i1 immarg)
-declare <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32> , <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half> , <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16> , <8 x float>)
+declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg)
+declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg)
 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg)
 
@@ -20,7 +20,7 @@ declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 im
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_f16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -34,8 +34,8 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x float> %C)
+  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
   ret void
@@ -43,7 +43,7 @@ bb:
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f32_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -57,8 +57,8 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x float> %C)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x float> %C)
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C)
+  %res2 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x float> %C)
   store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
   store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
   ret void
@@ -66,7 +66,7 @@ bb:
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -80,14 +80,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 0)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 0)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 0)
+  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 0)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <8 x float> %C, <8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <16 x half> %C, <16 x half> addrspace(1)* %out, <16 x half> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_f16_16x16x16_f16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -101,16 +101,16 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <8 x float> %C, i1 1)
-  %res2 = call <8 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <8 x float> %C, i1 1)
-  store <8 x float> %res, <8 x float> addrspace(1)* %out, align 32
-  store <8 x float> %res2, <8 x float> addrspace(1)* %out2, align 32
+  %res = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 1)
+  %res2 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <16 x half> %C, i1 1)
+  store <16 x half> %res, <16 x half> addrspace(1)* %out, align 32
+  store <16 x half> %res2, <16 x half> addrspace(1)* %out2, align 32
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23]
@@ -124,14 +124,14 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 0)
-  %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 0)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 0)
+  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 0)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, <8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, <16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %out2) {
 ; W32-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W32:       ; %bb.0: ; %bb
 ; W32-NEXT:    v_wmma_bf16_16x16x16_bf16 v[28:35], v[0:7], v[8:15], v[16:23] op_sel:[0,0,1]
@@ -145,10 +145,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W32-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W32-NEXT:    s_endpgm
 bb:
-  %res = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, i1 1)
-  %res2 = call <8 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <8 x i32> %C, i1 1)
-  store <8 x i32> %res, <8 x i32> addrspace(1)* %out, align 32
-  store <8 x i32> %res2, <8 x i32> addrspace(1)* %out2, align 32
+  %res = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 1)
+  %res2 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <16 x i16> %C, i1 1)
+  store <16 x i16> %res, <16 x i16> addrspace(1)* %out, align 32
+  store <16 x i16> %res2, <16 x i16> addrspace(1)* %out2, align 32
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
index e02cbcb..3ce95a7 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
+++ b/llvm/test/CodeGen/AMDGPU/wmma_multiple_64.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=W64
 
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x float>)
-declare <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float>, <8 x float>, <4 x float>, i1 immarg)
-declare <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32>, <8 x i32>, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half>, <16 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16>, <16 x i16>, <4 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half>, <16 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16>, <8 x i16>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8(i1 immarg, <4 x i32>, i1 immarg, <4 x i32>, <4 x i32>, i1 immarg)
 declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <4 x i32>, i1 immarg)
 
@@ -20,7 +20,7 @@ declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4(i1 immarg, <2 x i32>, i1 im
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_f16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -30,8 +30,8 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x float> %A, <8 x float> %B
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %A, <16 x half> %B, <4 x float> %C)
+  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16(<16 x half> %B, <16 x half> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
   ret void
@@ -39,7 +39,7 @@ bb:
 
 ; @llvm.amdgcn.wmma.f32.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f32_16x16x16_bf16:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f32_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -49,8 +49,8 @@ define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i32> %A, <8 x i32> %B, <
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x float> %C)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x float> %C)
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <4 x float> %C)
+  %res2 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <4 x float> %C)
   store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
   store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
   ret void
@@ -58,7 +58,7 @@ bb:
 
 ; @llvm.amdgcn.wmma.f16.16x16x16.f16
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -68,14 +68,14 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_lo(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 0)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 0)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 0)
+  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float> %B, <4 x float> %C, <4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<16 x half> %A, <16 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_f16_16x16x16_f16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_f16_16x16x16_f16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -85,16 +85,16 @@ define amdgpu_ps void @test_wmma_f16_16x16x16_f16_hi(<8 x float> %A, <8 x float>
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %A, <8 x float> %B, <4 x float> %C, i1 1)
-  %res2 = call <4 x float> @llvm.amdgcn.wmma.f16.16x16x16.f16(<8 x float> %B, <8 x float> %B, <4 x float> %C, i1 1)
-  store <4 x float> %res, <4 x float> addrspace(1)* %out, align 16
-  store <4 x float> %res2, <4 x float> addrspace(1)* %out2, align 16
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, i1 1)
+  %res2 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16(<16 x half> %B, <16 x half> %B, <8 x half> %C, i1 1)
+  store <8 x half> %res, <8 x half> addrspace(1)* %out, align 16
+  store <8 x half> %res2, <8 x half> addrspace(1)* %out2, align 16
   ret void
 }
 
 ; @llvm.amdgcn.wmma.bf16.16x16x16.bf16
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_lo:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19]
@@ -104,14 +104,14 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_lo(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 0)
-  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 0)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 0)
+  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
   ret void
 }
 
-define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, <4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %out2) {
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, <8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %out2) {
 ; W64-LABEL: test_wmma_bf16_16x16x16_bf16_hi:
 ; W64:       ; %bb.0: ; %bb
 ; W64-NEXT:    v_wmma_bf16_16x16x16_bf16 v[24:27], v[0:7], v[8:15], v[16:19] op_sel:[0,0,1]
@@ -121,10 +121,10 @@ define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_hi(<8 x i32> %A, <8 x i32> %
 ; W64-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; W64-NEXT:    s_endpgm
 bb:
-  %res = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %A, <8 x i32> %B, <4 x i32> %C, i1 1)
-  %res2 = call <4 x i32> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x i32> %B, <8 x i32> %B, <4 x i32> %C, i1 1)
-  store <4 x i32> %res, <4 x i32> addrspace(1)* %out, align 16
-  store <4 x i32> %res2, <4 x i32> addrspace(1)* %out2, align 16
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %A, <16 x i16> %B, <8 x i16> %C, i1 1)
+  %res2 = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16> %B, <16 x i16> %B, <8 x i16> %C, i1 1)
+  store <8 x i16> %res, <8 x i16> addrspace(1)* %out, align 16
+  store <8 x i16> %res2, <8 x i16> addrspace(1)* %out2, align 16
   ret void
 }