From 59691dc8740c7eada7fcf5552e0d2377780c6fb7 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 18 Sep 2020 13:20:00 -0700 Subject: [PATCH] [AMDGPU] Make ds fp atomics overloadable Differential Revision: https://reviews.llvm.org/D87947 --- clang/lib/CodeGen/CGBuiltin.cpp | 26 ++++++++++++++++++++++++++ clang/test/CodeGenCUDA/builtins-amdgcn.cu | 2 +- clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl | 6 +++--- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 +++++++-------- llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll | 24 ++++++++++++------------ 5 files changed, 49 insertions(+), 24 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3c7f13a..92c537f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14746,6 +14746,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); return Builder.CreateCall(F, { Src0, Builder.getFalse() }); } + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: { + Intrinsic::ID Intrin; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + Intrin = Intrinsic::amdgcn_ds_fadd; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + Intrin = Intrinsic::amdgcn_ds_fmin; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + Intrin = Intrinsic::amdgcn_ds_fmax; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() }); + llvm::FunctionType *FTy = F->getFunctionType(); + llvm::Type *PTy = FTy->getParamType(0); + Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy); + return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 }); + } case AMDGPU::BI__builtin_amdgcn_read_exec: { CallInst *CI = cast( EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec")); diff --git a/clang/test/CodeGenCUDA/builtins-amdgcn.cu b/clang/test/CodeGenCUDA/builtins-amdgcn.cu index c10eae9..1c3a790 100644 --- a/clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ b/clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -10,7 +10,7 @@ __global__ void use_dispatch_ptr(int* out) { } // CHECK-LABEL: @_Z12test_ds_fmaxf( -// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false) +// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false) __global__ void test_ds_fmax(float src) { __shared__ float shared; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 5884f84..4408b0432 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -114,19 +114,19 @@ void test_update_dpp(global int* out, int arg1, int arg2) } // CHECK-LABEL: @test_ds_fadd -// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_faddf(local float *out, float src) { *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false); } // CHECK-LABEL: @test_ds_fmin -// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_fminf(local float *out, float src) { *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false); } // CHECK-LABEL: @test_ds_fmax -// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_fmaxf(local float *out, float src) { *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false); } diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3df07e8..918ab3e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -397,11 +397,10 @@ class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty], def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; -class AMDGPULDSF32Intrin : - GCCBuiltin, - Intrinsic<[llvm_float_ty], - [LLVMQualPointerType, - llvm_float_ty, +class AMDGPULDSIntrin : + Intrinsic<[llvm_any_ty], + [LLVMQualPointerType, 3>, + LLVMMatchType<0>, llvm_i32_ty, // ordering llvm_i32_ty, // scope llvm_i1_ty], // isVolatile @@ -446,9 +445,9 @@ def int_amdgcn_ds_ordered_swap : AMDGPUDSOrderedIntrinsic; def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; -def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">; -def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">; -def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">; +def int_amdgcn_ds_fadd : AMDGPULDSIntrin; +def int_amdgcn_ds_fmin : AMDGPULDSIntrin; +def int_amdgcn_ds_fmax : AMDGPULDSIntrin; } // TargetPrefix = "amdgcn" diff --git a/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll b/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll index a33fcf4..0e21f33 100644 --- a/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll +++ b/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1) ; GCN-LABEL: {{^}}lds_ds_fadd: ; VI-DAG: s_mov_b32 m0 @@ -19,9 +19,9 @@ define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } @@ -40,9 +40,9 @@ define amdgpu_kernel void @lds_ds_fmin(float addrspace(1)* %out, float addrspace %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } @@ -61,9 +61,9 @@ define amdgpu_kernel void @lds_ds_fmax(float addrspace(1)* %out, float addrspace %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } -- 2.7.4