def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global">,
Arguments<(ins LLVM_i8Ptr_shared:$dst,
LLVM_i8Ptr_global:$src,
- I32Attr:$size)> {
+ I32Attr:$size,
+ OptionalAttr<UnitAttr>:$bypass_l1)> {
string llvmBuilder = [{
llvm::Intrinsic::ID id;
switch ($size) {
id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8;
break;
case 16:
- id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
+ if(static_cast<bool>($bypass_l1))
+ id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
+ else
+ id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
break;
default:
llvm_unreachable("unsupported async copy size");
int64_t sizeInBytes =
(dstMemrefType.getElementTypeBitWidth() / 8) * numElements;
rewriter.create<NVVM::CpAsyncOp>(loc, dstPtr, scrPtr,
- rewriter.getI32IntegerAttr(sizeInBytes));
+ rewriter.getI32IntegerAttr(sizeInBytes),
+ /*bypassL1=*/UnitAttr());
// Drop the result token.
Value zero = rewriter.create<LLVM::ConstantOp>(
LogicalResult CpAsyncOp::verify() {
if (size() != 4 && size() != 8 && size() != 16)
return emitError("expected byte size to be either 4, 8 or 16.");
+ if (bypass_l1() && size() != 16)
+ return emitError("bypass l1 is only support for 16 bytes copy.");
return success();
}
// -----
+func.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
+ // expected-error @below {{bypass l1 is only support for 16 bytes copy.}}
+ nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1}
+ return
+}
+
+// -----
+
func.func @gep_struct_variable(%arg0: !llvm.ptr<struct<(i32)>>, %arg1: i32, %arg2: i32) {
// expected-error @below {{op expected index 1 indexing a struct to be constant}}
llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr<struct<(i32)>>, i32, i32) -> !llvm.ptr<i32>
llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16
nvvm.cp.async.shared.global %arg0, %arg1, 16
+// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 {bypass_l1}
+ nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1}
// CHECK: nvvm.cp.async.commit.group
nvvm.cp.async.commit.group
// CHECK: nvvm.cp.async.wait.group 0
nvvm.cp.async.shared.global %arg0, %arg1, 8
// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(i8 addrspace(3)* %{{.*}}, i8 addrspace(1)* %{{.*}})
nvvm.cp.async.shared.global %arg0, %arg1, 16
+// CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(i8 addrspace(3)* %{{.*}}, i8 addrspace(1)* %{{.*}})
+ nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1}
// CHECK: call void @llvm.nvvm.cp.async.commit.group()
nvvm.cp.async.commit.group
// CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)