From 7b82b4bddbf4af617491b90ecd8b9b7d54a3a133 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 2 May 2017 16:57:44 +0000 Subject: [PATCH] AMDGPU: Make intrinsics speculatable llvm-svn: 301937 --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 156 +++++++++++++-------- llvm/lib/Target/AMDGPU/R600Intrinsics.td | 2 +- .../CodeGen/AMDGPU/annotate-kernel-features-hsa.ll | 4 +- llvm/test/CodeGen/AMDGPU/zext-lid.ll | 39 +++--- .../Transforms/InstCombine/amdgcn-intrinsics.ll | 6 +- 5 files changed, 126 insertions(+), 81 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 1249c1f..d7413fe 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -12,10 +12,10 @@ //===----------------------------------------------------------------------===// class AMDGPUReadPreloadRegisterIntrinsic - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; class AMDGPUReadPreloadRegisterIntrinsicNamed - : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, GCCBuiltin; + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>, GCCBuiltin; let TargetPrefix = "r600" in { @@ -47,7 +47,8 @@ def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">, // AS 7 is PARAM_I_ADDRESS, used for kernel arguments def int_r600_implicitarg_ptr : GCCBuiltin<"__builtin_r600_implicitarg_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_r600_rat_store_typed : // 1st parameter: Data @@ -57,15 +58,15 @@ def int_r600_rat_store_typed : GCCBuiltin<"__builtin_r600_rat_store_typed">; def int_r600_recipsqrt_ieee : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_r600_recipsqrt_clamped : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_r600_cube : Intrinsic< - [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600" @@ -82,31 +83,36 @@ defm int_amdgcn_workgroup_id : AMDGPUReadPreloadRegisterIntrinsic_xyz_named def int_amdgcn_dispatch_ptr : GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_queue_ptr : GCCBuiltin<"__builtin_amdgcn_queue_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_kernarg_segment_ptr : GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_implicitarg_ptr : GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_groupstaticsize : GCCBuiltin<"__builtin_amdgcn_groupstaticsize">, - Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_dispatch_id : GCCBuiltin<"__builtin_amdgcn_dispatch_id">, - Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_implicit_buffer_ptr : GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">, - Intrinsic<[LLVMQualPointerType], [], [IntrNoMem]>; + Intrinsic<[LLVMQualPointerType], [], + [IntrNoMem, IntrSpeculatable]>; // Set EXEC to the 64-bit value given. // This is always moved to the beginning of the basic block. @@ -150,115 +156,129 @@ def int_amdgcn_div_scale : Intrinsic< // second. (0 = first, 1 = second). [llvm_anyfloat_ty, llvm_i1_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_div_fmas : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, llvm_i1_ty], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_div_fixup : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [IntrNoMem] + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_trig_preop : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_sin : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cos : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_log_clamp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_fmul_legacy : GCCBuiltin<"__builtin_amdgcn_fmul_legacy">, - Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rcp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rcp_legacy : GCCBuiltin<"__builtin_amdgcn_rcp_legacy">, - Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem] + Intrinsic<[llvm_float_ty], [llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">, Intrinsic< - [llvm_float_ty], [llvm_float_ty], [IntrNoMem] + [llvm_float_ty], [llvm_float_ty], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_rsq_clamp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; def int_amdgcn_ldexp : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_frexp_mant : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_frexp_exp : Intrinsic< - [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem] + [llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem, IntrSpeculatable] >; // v_fract is buggy on SI/CI. It mishandles infinities, may return 1.0 // and always uses rtz, so is not suitable for implementing the OpenCL // fract function. It should be ok on VI. def int_amdgcn_fract : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cvt_pkrtz : Intrinsic< - [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_class : Intrinsic< - [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem] + [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">, Intrinsic<[llvm_anyfloat_ty], - [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem] + [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubema : GCCBuiltin<"__builtin_amdgcn_cubema">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubesc : GCCBuiltin<"__builtin_amdgcn_cubesc">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">, Intrinsic<[llvm_float_ty], - [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty, llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; // v_ffbh_i32, as opposed to v_ffbh_u32. For v_ffbh_u32, llvm.ctlz // should be used. def int_amdgcn_sffbh : - Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], + [IntrNoMem, IntrSpeculatable] +>; // Fields should mirror atomicrmw @@ -542,7 +562,9 @@ def int_amdgcn_s_decperflevel : def int_amdgcn_s_getreg : GCCBuiltin<"__builtin_amdgcn_s_getreg">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrReadMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty], + [IntrReadMem, IntrSpeculatable] +>; // __builtin_amdgcn_interp_mov , , , // param values: 0 = P10, 1 = P20, 2 = P0 @@ -550,23 +572,24 @@ def int_amdgcn_interp_mov : GCCBuiltin<"__builtin_amdgcn_interp_mov">, Intrinsic<[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; + [IntrNoMem, IntrSpeculatable]>; // __builtin_amdgcn_interp_p1 , , , +// This intrinsic reads from lds, but the memory values are constant, +// so it behaves like IntrNoMem. def int_amdgcn_interp_p1 : GCCBuiltin<"__builtin_amdgcn_interp_p1">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; // This intrinsic reads from lds, but the memory - // values are constant, so it behaves like IntrNoMem. + [IntrNoMem, IntrSpeculatable]>; // __builtin_amdgcn_interp_p2 , , , , def int_amdgcn_interp_p2 : GCCBuiltin<"__builtin_amdgcn_interp_p2">, Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], - [IntrNoMem]>; // See int_amdgcn_v_interp_p1 for why this is - // IntrNoMem. + [IntrNoMem, IntrSpeculatable]>; + // See int_amdgcn_v_interp_p1 for why this is IntrNoMem. // Pixel shaders only: whether the current pixel is live (i.e. not a helper // invocation for derivative computation). @@ -589,48 +612,68 @@ def int_amdgcn_ds_swizzle : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>; def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem] + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty], - [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem] + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] >; def int_amdgcn_lerp : GCCBuiltin<"__builtin_amdgcn_lerp">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_u8 : GCCBuiltin<"__builtin_amdgcn_sad_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_msad_u8 : GCCBuiltin<"__builtin_amdgcn_msad_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_hi_u8 : GCCBuiltin<"__builtin_amdgcn_sad_hi_u8">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_sad_u16 : GCCBuiltin<"__builtin_amdgcn_sad_u16">, - Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_qsad_pk_u16_u8 : GCCBuiltin<"__builtin_amdgcn_qsad_pk_u16_u8">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_mqsad_pk_u16_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_pk_u16_u8">, - Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], [IntrNoMem]>; + Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i64_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_mqsad_u32_u8 : GCCBuiltin<"__builtin_amdgcn_mqsad_u32_u8">, - Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_v4i32_ty], [llvm_i64_ty, llvm_i32_ty, llvm_v4i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_cvt_pk_u8_f32 : GCCBuiltin<"__builtin_amdgcn_cvt_pk_u8_f32">, - Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + Intrinsic<[llvm_i32_ty], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrSpeculatable] +>; def int_amdgcn_icmp : Intrinsic<[llvm_i64_ty], [llvm_anyint_ty, LLVMMatchType<0>, llvm_i32_ty], @@ -731,6 +774,7 @@ def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>; // Emit 2.5 ulp, no denormal division. Should only be inserted by // pass based on !fpmath metadata. def int_amdgcn_fdiv_fast : Intrinsic< - [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem] + [llvm_float_ty], [llvm_float_ty, llvm_float_ty], + [IntrNoMem, IntrSpeculatable] >; } diff --git a/llvm/lib/Target/AMDGPU/R600Intrinsics.td b/llvm/lib/Target/AMDGPU/R600Intrinsics.td index a5310e9..4c9e1e8 100644 --- a/llvm/lib/Target/AMDGPU/R600Intrinsics.td +++ b/llvm/lib/Target/AMDGPU/R600Intrinsics.td @@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput; def int_r600_ddy : TextureIntrinsicFloatInput; def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600", isTarget = 1 diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index e2620ce..f7461b9 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -221,10 +221,10 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* ret void } -attributes #0 = { nounwind readnone } +attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind } -; HSA: attributes #0 = { nounwind readnone } +; HSA: attributes #0 = { nounwind readnone speculatable } ; HSA: attributes #1 = { nounwind } ; HSA: attributes #2 = { nounwind "amdgpu-work-group-id-y" } ; HSA: attributes #3 = { nounwind "amdgpu-work-group-id-z" } diff --git a/llvm/test/CodeGen/AMDGPU/zext-lid.ll b/llvm/test/CodeGen/AMDGPU/zext-lid.ll index 8eeff53..066f292 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-lid.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-lid.ll @@ -4,19 +4,19 @@ ; CHECK-NOT: and_b32 ; OPT-LABEL: @zext_grp_size_128 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !0 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !0 define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 127 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 127 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 127 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -24,19 +24,19 @@ bb: } ; OPT-LABEL: @zext_grp_size_32x4x1 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !2 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !3 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !4 define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 31 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 3 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 1 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -44,19 +44,19 @@ bb: } ; OPT-LABEL: @zext_grp_size_512 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5 -; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.x(), !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.y(), !range !5 +; OPT: tail call i32 @llvm.amdgcn.workitem.id.z(), !range !5 define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 { bb: - %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = and i32 %tmp, 65535 store i32 %tmp1, i32 addrspace(1)* %arg, align 4 - %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2 + %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp3 = and i32 %tmp2, 65535 %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4 - %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2 + %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() %tmp6 = and i32 %tmp5, 65535 %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4 @@ -71,7 +71,8 @@ declare i32 @llvm.amdgcn.workitem.id.z() #2 attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" } attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" } -attributes #2 = { nounwind readnone } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nounwind readnone } !0 = !{i32 32, i32 4, i32 1} diff --git a/llvm/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/llvm/test/Transforms/InstCombine/amdgcn-intrinsics.ll index 357085f..1901997 100644 --- a/llvm/test/Transforms/InstCombine/amdgcn-intrinsics.ll +++ b/llvm/test/Transforms/InstCombine/amdgcn-intrinsics.ll @@ -1259,7 +1259,7 @@ define i64 @icmp_constant_inputs_false() { } ; CHECK-LABEL: @icmp_constant_inputs_true( -; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #4 +; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #5 define i64 @icmp_constant_inputs_true() { %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 34) ret i64 %result @@ -1524,7 +1524,7 @@ define i64 @fcmp_constant_inputs_false() { } ; CHECK-LABEL: @fcmp_constant_inputs_true( -; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #4 +; CHECK: %result = call i64 @llvm.read_register.i64(metadata !0) #5 define i64 @fcmp_constant_inputs_true() { %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 4) ret i64 %result @@ -1537,4 +1537,4 @@ define i64 @fcmp_constant_to_rhs_olt(float %x) { ret i64 %result } -; CHECK: attributes #4 = { convergent } +; CHECK: attributes #5 = { convergent } -- 2.7.4