[IntrReadMem, IntrWillReturn]>;
//===----------------------------------------------------------------------===//
+// GFX11 Intrinsics
+//===----------------------------------------------------------------------===//
+
+// llvm.amdgcn.permlane64 <src0>
+def int_amdgcn_permlane64 :
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+ [IntrNoMem, IntrConvergent, IntrWillReturn]>;
+
+//===----------------------------------------------------------------------===//
// Deep learning intrinsics.
//===----------------------------------------------------------------------===//
if (ST->isWave32())
return V;
+ if (ST->hasPermLane64()) {
+ // Reduce across the upper and lower 32 lanes.
+ return buildNonAtomicBinOp(
+ B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+ }
+
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
return IC.replaceOperand(II, 0, UndefValue::get(VDstIn->getType()));
}
+ case Intrinsic::amdgcn_permlane64:
+ // A constant value is trivially uniform.
+ if (Constant *C = dyn_cast<Constant>(II.getArgOperand(0))) {
+ return IC.replaceInstUsesWith(II, C);
+ }
+ break;
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
// A constant value is trivially uniform.
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_permlane64:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
let SubtargetPredicate = isGFX11Plus in {
// Restrict src0 to be VGPR
def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS,
- [],
+ getVOP1Pat64<int_amdgcn_permlane64,
+ VOP_MOVRELS>.ret,
/*VOP1Only=*/ 1>;
defm V_NOT_B16 : VOP1Inst<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst<"v_cvt_i32_i16", VOP_I32_I16>;
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT: v_permlane64_b32 v2, v1
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT: v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: s_add_i32 s0, s2, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT: v_mov_b32_e32 v0, v1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-NEXT: s_cbranch_execz .LBB3_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v3, s0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT: ds_add_u32 v0, v3
+; GFX1164-NEXT: ds_add_u32 v3, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB3_2:
; GFX1164-NEXT: v_mov_b32_e32 v2, v1
; GFX1164-NEXT: v_permlanex16_b32 v2, v2, -1, -1
; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; GFX1164-NEXT: v_permlane64_b32 v2, v1
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
; GFX1164-NEXT: s_or_saveexec_b64 s[0:1], -1
-; GFX1164-NEXT: v_readlane_b32 s2, v1, 0
-; GFX1164-NEXT: v_readlane_b32 s3, v1, 32
+; GFX1164-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1164-NEXT: s_mov_b64 exec, s[0:1]
-; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-; GFX1164-NEXT: s_add_i32 s0, s2, s3
-; GFX1164-NEXT: s_mov_b64 s[2:3], exec
-; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0
+; GFX1164-NEXT: v_mov_b32_e32 v0, v1
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v3
; GFX1164-NEXT: s_cbranch_execz .LBB10_2
; GFX1164-NEXT: ; %bb.1:
-; GFX1164-NEXT: v_mov_b32_e32 v0, 0
-; GFX1164-NEXT: v_mov_b32_e32 v3, s0
+; GFX1164-NEXT: v_mov_b32_e32 v3, 0
; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX1164-NEXT: ds_sub_u32 v0, v3
+; GFX1164-NEXT: ds_sub_u32 v3, v0
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_gl0_inv
; GFX1164-NEXT: .LBB10_2:
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+
+declare i32 @llvm.amdgcn.permlane64(i32)
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+define amdgpu_kernel void @test_s(i32 addrspace(1)* %out, i32 %src0) {
+; GFX11-LABEL: test_s:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_permlane64_b32 v0, v0
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %src0)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @test_i(i32 addrspace(1)* %out) {
+; GFX11-LABEL: test_i:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0x63
+; GFX11-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-NEXT: v_permlane64_b32 v0, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.permlane64(i32 99)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_kernel void @test_v(i32 addrspace(1)* %out, i32 %src0) #1 {
+; GFX11-SDAG-LABEL: test_v:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_permlane64_b32 v0, v0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: test_v:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-GISEL-NEXT: v_permlane64_b32 v0, v0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-GISEL-NEXT: s_endpgm
+ %tidx = call i32 @llvm.amdgcn.workitem.id.x()
+ %v = call i32 @llvm.amdgcn.permlane64(i32 %tidx)
+ store i32 %v, i32 addrspace(1)* %out
+ ret void
+}
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=amdgcn-amd-amdhsa -instcombine -S | FileCheck %s
+
+; Optimize the intrinsic away if the argument is uniform.
+define i32 @test_constant() {
+; CHECK-LABEL: @test_constant(
+; CHECK-NEXT: ret i32 99
+;
+ %call = call i32 @llvm.amdgcn.permlane64(i32 99)
+ ret i32 %call
+}
+
+declare i32 @llvm.amdgcn.permlane64(i32)