From f226e28a880f8e40b1bfd4c77b9768a667372d22 Mon Sep 17 00:00:00 2001
From: Steffen Larsen
Date: Mon, 17 May 2021 09:23:44 -0700
Subject: [PATCH] [Clang][NVPTX] Add NVPTX intrinsics and builtins for CUDA PTX
 redux.sync instructions

Adds NVPTX builtins and intrinsics for the CUDA PTX `redux.sync` instructions
for the `sm_80` architecture or newer.

PTX ISA description of `redux.sync`:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-redux-sync

Authored-by: Steffen Larsen
Differential Revision: https://reviews.llvm.org/D100124
---
 clang/include/clang/Basic/BuiltinsNVPTX.def | 10 +++++
 clang/test/CodeGenCUDA/redux-builtins.cu    | 47 +++++++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsNVVM.td      | 43 +++++++++++++++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td    | 16 +++++++
 llvm/test/CodeGen/NVPTX/redux-sync.ll       | 65 +++++++++++++++++++++++++++++
 5 files changed, 181 insertions(+)
 create mode 100644 clang/test/CodeGenCUDA/redux-builtins.cu
 create mode 100644 llvm/test/CodeGen/NVPTX/redux-sync.ll

diff --git a/clang/include/clang/Basic/BuiltinsNVPTX.def b/clang/include/clang/Basic/BuiltinsNVPTX.def
index 3feea85..98f3c65 100644
--- a/clang/include/clang/Basic/BuiltinsNVPTX.def
+++ b/clang/include/clang/Basic/BuiltinsNVPTX.def
@@ -456,6 +456,16 @@ TARGET_BUILTIN(__nvvm_match_any_sync_i64, "WiUiWi", "", PTX60)
 TARGET_BUILTIN(__nvvm_match_all_sync_i32p, "UiUiUii*", "", PTX60)
 TARGET_BUILTIN(__nvvm_match_all_sync_i64p, "WiUiWii*", "", PTX60)
 
+// Redux
+TARGET_BUILTIN(__nvvm_redux_sync_add, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_min, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_max, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_umin, "UiUii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_umax, "UiUii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_and, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_xor, "iii", "", AND(SM_80,PTX70))
+TARGET_BUILTIN(__nvvm_redux_sync_or, "iii", "", AND(SM_80,PTX70))
+
 // Membar
 BUILTIN(__nvvm_membar_cta, "v", "")
diff --git a/clang/test/CodeGenCUDA/redux-builtins.cu b/clang/test/CodeGenCUDA/redux-builtins.cu
new file mode 100644
index 0000000..bdcd00b
--- /dev/null
+++ b/clang/test/CodeGenCUDA/redux-builtins.cu
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 "-triple" "nvptx-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" "-target-feature" "+ptx70" "-target-cpu" "sm_80" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s
+
+// CHECK: define{{.*}} void @_Z6kernelPi(i32* %out)
+__attribute__((global)) void kernel(int *out) {
+  int a = 1;
+  unsigned int b = 5;
+  int i = 0;
+
+  out[i++] = __nvvm_redux_sync_add(a, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.add
+
+  out[i++] = __nvvm_redux_sync_add(b, 0x01);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.add
+
+  out[i++] = __nvvm_redux_sync_min(a, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.min
+
+  out[i++] = __nvvm_redux_sync_umin(b, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.umin
+
+  out[i++] = __nvvm_redux_sync_max(a, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.max
+
+  out[i++] = __nvvm_redux_sync_umax(b, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.umax
+
+  out[i++] = __nvvm_redux_sync_and(a, 0xF0);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.and
+
+  out[i++] = __nvvm_redux_sync_and(b, 0x0F);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.and
+
+  out[i++] = __nvvm_redux_sync_xor(a, 0x10);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.xor
+
+  out[i++] = __nvvm_redux_sync_xor(b, 0x01);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.xor
+
+  out[i++] = __nvvm_redux_sync_or(a, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.or
+
+  out[i++] = __nvvm_redux_sync_or(b, 0xFF);
+  // CHECK: call i32 @llvm.nvvm.redux.sync.or
+
+  // CHECK: ret void
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index fa66a4a..71e31b1 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4205,6 +4205,49 @@ def int_nvvm_match_all_sync_i64p :
             [IntrInaccessibleMemOnly, IntrConvergent], "llvm.nvvm.match.all.sync.i64p">;
 
 //
+// REDUX.SYNC
+//
+// redux.sync.min.u32 dst, src, membermask;
+def int_nvvm_redux_sync_umin : GCCBuiltin<"__nvvm_redux_sync_umin">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.max.u32 dst, src, membermask;
+def int_nvvm_redux_sync_umax : GCCBuiltin<"__nvvm_redux_sync_umax">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.add.s32 dst, src, membermask;
+def int_nvvm_redux_sync_add : GCCBuiltin<"__nvvm_redux_sync_add">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.min.s32 dst, src, membermask;
+def int_nvvm_redux_sync_min : GCCBuiltin<"__nvvm_redux_sync_min">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.max.s32 dst, src, membermask;
+def int_nvvm_redux_sync_max : GCCBuiltin<"__nvvm_redux_sync_max">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.and.b32 dst, src, membermask;
+def int_nvvm_redux_sync_and : GCCBuiltin<"__nvvm_redux_sync_and">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.xor.b32 dst, src, membermask;
+def int_nvvm_redux_sync_xor : GCCBuiltin<"__nvvm_redux_sync_xor">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+// redux.sync.or.b32 dst, src, membermask;
+def int_nvvm_redux_sync_or : GCCBuiltin<"__nvvm_redux_sync_or">,
+  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+            [IntrConvergent, IntrInaccessibleMemOnly]>;
+
+//
 // WMMA instructions
 //
 // WMMA.LOAD
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 0f65093..1aaa9f0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -274,6 +274,22 @@ defm MATCH_ALLP_SYNC_32 : MATCH_ALLP_SYNC
 
+multiclass REDUX_SYNC<string BinOp, string PTXType, Intrinsic Intrin> {
+  def : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src, Int32Regs:$mask),
+          "redux.sync." # BinOp # "." # PTXType # " $dst, $src, $mask;",
+          [(set Int32Regs:$dst, (Intrin Int32Regs:$src, Int32Regs:$mask))]>,
+        Requires<[hasPTX70, hasSM80]>;
+}
+
+defm REDUX_SYNC_UMIN : REDUX_SYNC<"min", "u32", int_nvvm_redux_sync_umin>;
+defm REDUX_SYNC_UMAX : REDUX_SYNC<"max", "u32", int_nvvm_redux_sync_umax>;
+defm REDUX_SYNC_ADD : REDUX_SYNC<"add", "s32", int_nvvm_redux_sync_add>;
+defm REDUX_SYNC_MIN : REDUX_SYNC<"min", "s32", int_nvvm_redux_sync_min>;
+defm REDUX_SYNC_MAX : REDUX_SYNC<"max", "s32", int_nvvm_redux_sync_max>;
+defm REDUX_SYNC_AND : REDUX_SYNC<"and", "b32", int_nvvm_redux_sync_and>;
+defm REDUX_SYNC_XOR : REDUX_SYNC<"xor", "b32", int_nvvm_redux_sync_xor>;
+defm REDUX_SYNC_OR : REDUX_SYNC<"or", "b32", int_nvvm_redux_sync_or>;
+
 } // isConvergent = true
 
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/redux-sync.ll b/llvm/test/CodeGen/NVPTX/redux-sync.ll
new file mode 100644
index 0000000..43a304e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/redux-sync.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_80 -mattr=+ptx70 | FileCheck %s
+
+declare i32 @llvm.nvvm.redux.sync.umin(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_min_u32
+define i32 @redux_sync_min_u32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.min.u32
+  %val = call i32 @llvm.nvvm.redux.sync.umin(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.umax(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_max_u32
+define i32 @redux_sync_max_u32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.max.u32
+  %val = call i32 @llvm.nvvm.redux.sync.umax(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.add(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_add_s32
+define i32 @redux_sync_add_s32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.add.s32
+  %val = call i32 @llvm.nvvm.redux.sync.add(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.min(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_min_s32
+define i32 @redux_sync_min_s32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.min.s32
+  %val = call i32 @llvm.nvvm.redux.sync.min(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.max(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_max_s32
+define i32 @redux_sync_max_s32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.max.s32
+  %val = call i32 @llvm.nvvm.redux.sync.max(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.and(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_and_b32
+define i32 @redux_sync_and_b32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.and.b32
+  %val = call i32 @llvm.nvvm.redux.sync.and(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.xor(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_xor_b32
+define i32 @redux_sync_xor_b32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.xor.b32
+  %val = call i32 @llvm.nvvm.redux.sync.xor(i32 %src, i32 %mask)
+  ret i32 %val
+}
+
+declare i32 @llvm.nvvm.redux.sync.or(i32, i32)
+; CHECK-LABEL: .func{{.*}}redux_sync_or_b32
+define i32 @redux_sync_or_b32(i32 %src, i32 %mask) {
+; CHECK: redux.sync.or.b32
+  %val = call i32 @llvm.nvvm.redux.sync.or(i32 %src, i32 %mask)
+  ret i32 %val
+}
-- 
2.7.4
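
A minimal usage sketch, not part of the patch: the kernel below shows how the new __nvvm_redux_sync_* builtins could drive a warp-wide reduction from CUDA device code, assuming the same sm_80 / PTX 7.0 target as the RUN lines in redux-builtins.cu. The kernel name (warp_sum_demo), the buffers, and the use of __nvvm_read_ptx_sreg_tid_x for the thread index are illustrative choices, not part of the change.

  // Illustrative sketch only; assumes device compilation for sm_80 / PTX 7.0+
  // and, for brevity, a single thread block.
  __attribute__((global)) void warp_sum_demo(const int *in, int *out) {
    int tid = __nvvm_read_ptx_sreg_tid_x(); // thread index within the block
    int v = in[tid];

    // Every lane named in the mask must reach this call; each of those lanes
    // receives the reduction of the values they contributed. 0xffffffff
    // selects the full warp.
    int warp_sum = __nvvm_redux_sync_add(v, 0xffffffff);

    // Lane 0 of each warp writes the warp-wide result.
    if ((tid & 31) == 0)
      out[tid >> 5] = warp_sum;
  }

The second argument mirrors the membermask operand of the PTX redux.sync instruction; the signed, unsigned, and bitwise variants are selected by the builtin name, as exercised in the tests above.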