[AMDGPU] V_LDEXP_F16 encoding fix and doc update.

author Joe Nash <Joseph.Nash@amd.com>

Tue, 18 Oct 2022 18:59:19 +0000 (14:59 -0400)

committer Joe Nash <Joseph.Nash@amd.com>

Wed, 19 Oct 2022 13:52:53 +0000 (09:52 -0400)
author Joe Nash <Joseph.Nash@amd.com>
Tue, 18 Oct 2022 18:59:19 +0000 (14:59 -0400)
committer Joe Nash <Joseph.Nash@amd.com>
Wed, 19 Oct 2022 13:52:53 +0000 (09:52 -0400)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td

index 2c63d2e..8f05eb1 100644 (file)
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -343,6 +343,7 @@ def int_amdgcn_rsq_legacy :  ClangBuiltin<"__builtin_amdgcn_rsq_legacy">,
  def int_amdgcn_rsq_clamp : Intrinsic<
    [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
  
+// For int_amdgcn_ldexp_f16, only the low 16 bits of the i32 src1 operand will used.
  def int_amdgcn_ldexp : Intrinsic<
    [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty],
    [IntrNoMem, IntrSpeculatable, IntrWillReturn]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td

index a04eb98..c3d66f4 100644 (file)
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -832,9 +832,20 @@ def :  divergent_i64_BinOp <xor, V_XOR_B32_e64>;
  // 16-Bit Operand Instructions
  //===----------------------------------------------------------------------===//
  
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> {
+  // The ldexp.f16 intrinsic expects a i32 src1 operand, though the hardware
+  // encoding treats src1 as an f16
+  let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
+  let Src1DPP = VGPR_32_Lo128;
+  let Src1ModDPP = IntT16VRegInputMods;
+}
+
  let isReMaterializable = 1 in {
  let FPDPRounding = 1 in {
-defm V_LDEXP_F16 : VOP2Inst_t16 <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+  let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts]  in
+    defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+  let SubtargetPredicate = HasTrue16BitInsts in
+    defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>;
  } // End FPDPRounding = 1
  // FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
  defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll

index 7068f45..0d17279 100644 (file)
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,13 +1,79 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
  
  declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
  
-; GCN-LABEL: {{^}}ldexp_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
-; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]]
-; GCN: buffer_store_short v[[R_F16]]
  define amdgpu_kernel void @ldexp_f16(
+; VI-LABEL: ldexp_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s14, s2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s12, s6
+; VI-NEXT:    s_mov_b32 s13, s7
+; VI-NEXT:    s_mov_b32 s15, s3
+; VI-NEXT:    s_mov_b32 s10, s2
+; VI-NEXT:    s_mov_b32 s11, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_clause 0x1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX10-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x34
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s14, s2
+; GFX10-NEXT:    s_mov_b32 s15, s3
+; GFX10-NEXT:    s_mov_b32 s10, s2
+; GFX10-NEXT:    s_mov_b32 s11, s3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s12, s6
+; GFX10-NEXT:    s_mov_b32 s13, s7
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[12:15], 0
+; GFX10-NEXT:    buffer_load_dword v1, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s0, s4
+; GFX10-NEXT:    s_mov_b32 s1, s5
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX10-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
+; GFX11-NEXT:    s_mov_b32 s10, -1
+; GFX11-NEXT:    s_mov_b32 s11, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s14, s10
+; GFX11-NEXT:    s_mov_b32 s15, s11
+; GFX11-NEXT:    s_mov_b32 s2, s10
+; GFX11-NEXT:    s_mov_b32 s3, s11
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s12, s6
+; GFX11-NEXT:    s_mov_b32 s13, s7
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[12:15], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s8, s4
+; GFX11-NEXT:    s_mov_b32 s9, s5
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
      half addrspace(1)* %r,
      half addrspace(1)* %a,
      i32 addrspace(1)* %b) {
@@ -18,11 +84,61 @@ define amdgpu_kernel void @ldexp_f16(
    ret void
  }
  
-; GCN-LABEL: {{^}}ldexp_f16_imm_a
-; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
-; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
-; GCN: buffer_store_short v[[R_F16]]
  define amdgpu_kernel void @ldexp_f16_imm_a(
+; VI-LABEL: ldexp_f16_imm_a:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16_imm_a:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16_imm_a:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e32 v0, 2.0, v0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
      half addrspace(1)* %r,
      i32 addrspace(1)* %b) {
    %b.val = load i32, i32 addrspace(1)* %b
@@ -31,11 +147,61 @@ define amdgpu_kernel void @ldexp_f16_imm_a(
    ret void
  }
  
-; GCN-LABEL: {{^}}ldexp_f16_imm_b
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}}
-; GCN: buffer_store_short v[[R_F16]]
  define amdgpu_kernel void @ldexp_f16_imm_b(
+; VI-LABEL: ldexp_f16_imm_b:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s10, s6
+; VI-NEXT:    s_mov_b32 s11, s7
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s8, s2
+; VI-NEXT:    s_mov_b32 s9, s3
+; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX10-LABEL: ldexp_f16_imm_b:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX10-NEXT:    s_mov_b32 s10, s6
+; GFX10-NEXT:    s_mov_b32 s11, s7
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s8, s2
+; GFX10-NEXT:    s_mov_b32 s9, s3
+; GFX10-NEXT:    s_mov_b32 s4, s0
+; GFX10-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GFX10-NEXT:    s_mov_b32 s5, s1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; GFX10-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ldexp_f16_imm_b:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s10, s6
+; GFX11-NEXT:    s_mov_b32 s11, s7
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s8, s2
+; GFX11-NEXT:    s_mov_b32 s9, s3
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_ldexp_f16_e64 v0, v0, 2
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
      half addrspace(1)* %r,
      half addrspace(1)* %a) {
    %a.val = load half, half addrspace(1)* %a
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s

index c2e6974..66cb4ca 100644 (file)
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s
@@ -73,6 +73,9 @@ v_fmac_f16_e32 v5, v1, v255
  v_fmamk_f16_e32 v5, v1, 0xfe0b, v255
  // GFX11: error: operands are not valid for this GPU or mode
  
+v_ldexp_f16_e32 v5, v1, v255
+// GFX11: error: operands are not valid for this GPU or mode
+
  v_max_f16_e32 v5, v1, v255
  // GFX11: error: operands are not valid for this GPU or mode
  
@@ -142,6 +145,9 @@ v_add_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
  v_fmac_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
  // GFX11: error: operands are not valid for this GPU or mode
  
+v_ldexp_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
+// GFX11: error: operands are not valid for this GPU or mode
+
  v_max_f16_dpp v5, v1, v255 quad_perm:[3,2,1,0]
  // GFX11: error: operands are not valid for this GPU or mode
  
@@ -211,6 +217,9 @@ v_add_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
  v_fmac_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
  // GFX11: error: operands are not valid for this GPU or mode
  
+v_ldexp_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: error: operands are not valid for this GPU or mode
+
  v_max_f16_dpp v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
  // GFX11: error: operands are not valid for this GPU or mode
  
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s

index c5f81d5..876b7b8 100644 (file)
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s
@@ -55,6 +55,9 @@ v_add_f16 v5, v1, v255
  v_fmac_f16 v5, v1, v255
  // GFX11: v_fmac_f16_e64
  
+v_ldexp_f16 v5, v1, v255
+// GFX11: v_ldexp_f16_e64
+
  v_max_f16 v5, v1, v255
  // GFX11: v_max_f16_e64
  
@@ -115,6 +118,9 @@ v_subrev_f16 v5, v255, v2 quad_perm:[3,2,1,0]
  v_add_f16 v5, v1, v255 quad_perm:[3,2,1,0]
  // GFX11: v_add_f16_e64
  
+v_ldexp_f16 v5, v1, v255 quad_perm:[3,2,1,0]
+// GFX11: v_ldexp_f16_e64
+
  v_max_f16 v5, v1, v255 quad_perm:[3,2,1,0]
  // GFX11: v_max_f16_e64
  
@@ -175,6 +181,9 @@ v_subrev_f16 v5, v255, v2 dpp8:[7,6,5,4,3,2,1,0]
  v_add_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
  // GFX11: v_add_f16_e64
  
+v_ldexp_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: v_ldexp_f16_e64
+
  v_max_f16 v5, v1, v255 dpp8:[7,6,5,4,3,2,1,0]
  // GFX11: v_max_f16_e64
author	Joe Nash <Joseph.Nash@amd.com>
	Tue, 18 Oct 2022 18:59:19 +0000 (14:59 -0400)
committer	Joe Nash <Joseph.Nash@amd.com>
	Wed, 19 Oct 2022 13:52:53 +0000 (09:52 -0400)
llvm/include/llvm/IR/IntrinsicsAMDGPU.td		patch \| blob \| history
llvm/lib/Target/AMDGPU/VOP2Instructions.td		patch \| blob \| history
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll		patch \| blob \| history
llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_err.s		patch \| blob \| history
llvm/test/MC/AMDGPU/gfx11_asm_vop2_t16_promote.s		patch \| blob \| history