Part of https://discourse.llvm.org/t/rfc-switching-the-llvm-dialect-and-dialect-lowerings-to-opaque-pointers/68179
This patch adds the new pass option `use-opaque-pointers` to the GPU to LLVM lowerings (including ROCDL and NVVM) and adapts the code to support using opaque pointers in addition to typed pointers.
The required changes mostly boil down to avoiding `getElementType` and specifying base types in GEP and Alloca.
In the future opaque pointers will be the only supported model; hence, tests have been ported to use opaque pointers by default. Additional regression tests for typed pointers have been added to avoid breaking existing clients.
Note: This does not yet port the `GpuToVulkan` passes.
Differential Revision: https://reviews.llvm.org/D144448
/*default=*/"gpu::getDefaultGpuBinaryAnnotation()",
"Annotation attribute string for GPU binary"
>,
+ Option<"useOpaquePointers", "use-opaque-pointers", "bool",
+ /*default=*/"false", "Generate LLVM IR using opaque pointers "
+ "instead of typed pointers">,
];
let dependentDialects = [
"Bitwidth of the index type, 0 to use size of machine word">,
Option<"hasRedux", "has-redux", "bool", /*default=*/"false",
"Target gpu supports redux">,
+ Option<"useOpaquePointers", "use-opaque-pointers", "bool",
+ /*default=*/"false", "Generate LLVM IR using opaque pointers "
+ "instead of typed pointers">,
];
}
clEnumValN(::mlir::gpu::amd::Runtime::Unknown, "unknown", "Unknown (default)"),
clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL")
- )}]>
+ )}]>,
+ Option<"useOpaquePointers", "use-opaque-pointers", "bool",
+ /*default=*/"false", "Generate LLVM IR using opaque pointers "
+ "instead of typed pointers">,
];
}
/// global and use it to compute the address of the first character in the
/// string (operations inserted at the builder insertion point).
Value createGlobalString(Location loc, OpBuilder &builder, StringRef name,
- StringRef value, Linkage linkage);
+ StringRef value, Linkage linkage,
+ bool useOpaquePointers);
/// LLVM requires some operations to be inside of a Module operation. This
/// function confirms that the Operation has the desired properties.
workgroupBuffers.push_back(globalOp);
}
- // Rewrite the original GPU function to an LLVM function.
- auto convertedType = typeConverter->convertType(gpuFuncOp.getFunctionType());
- if (!convertedType)
- return failure();
- auto funcType =
- convertedType.template cast<LLVM::LLVMPointerType>().getElementType();
-
// Remap proper input types.
TypeConverter::SignatureConversion signatureConversion(
gpuFuncOp.front().getNumArguments());
- getTypeConverter()->convertFunctionSignature(
+ Type funcType = getTypeConverter()->convertFunctionSignature(
gpuFuncOp.getFunctionType(), /*isVariadic=*/false, signatureConversion);
// Create the new function operation. Only copy those attributes that are
for (const auto &en : llvm::enumerate(workgroupBuffers)) {
LLVM::GlobalOp global = en.value();
- Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
+ Value address = rewriter.create<LLVM::AddressOfOp>(
+ loc,
+ getTypeConverter()->getPointerType(global.getType(),
+ global.getAddrSpace()),
+ global.getSymNameAttr());
auto elementType =
global.getType().cast<LLVM::LLVMArrayType>().getElementType();
Value memory = rewriter.create<LLVM::GEPOp>(
- loc, LLVM::LLVMPointerType::get(elementType, global.getAddrSpace()),
- address, ArrayRef<LLVM::GEPArg>{0, 0});
+ loc,
+ getTypeConverter()->getPointerType(elementType,
+ global.getAddrSpace()),
+ global.getType(), address, ArrayRef<LLVM::GEPArg>{0, 0});
// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).
- auto ptrType = LLVM::LLVMPointerType::get(
- typeConverter->convertType(type.getElementType())
- .template cast<Type>(),
- allocaAddrSpace);
+ Type elementType = typeConverter->convertType(type.getElementType());
+ auto ptrType =
+ getTypeConverter()->getPointerType(elementType, allocaAddrSpace);
Value numElements = rewriter.create<LLVM::ConstantOp>(
gpuFuncOp.getLoc(), int64Ty, type.getNumElements());
Value allocated = rewriter.create<LLVM::AllocaOp>(
- gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
+ gpuFuncOp.getLoc(), ptrType, elementType, numElements,
+ /*alignment=*/0);
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, *getTypeConverter(), type, allocated);
signatureConversion.remapInput(
Location loc = gpuPrintfOp->getLoc();
mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
- mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8);
+ mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8);
mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
// Note: this is the GPUModule op, not the ModuleOp that surrounds it
}
// Get a pointer to the format string's first element and pass it to printf()
- Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
+ Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
+ loc,
+ getTypeConverter()->getPointerType(globalType, global.getAddrSpace()),
+ global.getSymNameAttr());
Value stringStart = rewriter.create<LLVM::GEPOp>(
- loc, i8Ptr, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+ loc, i8Ptr, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
Value stringLen =
rewriter.create<LLVM::ConstantOp>(loc, llvmI64, formatStringSize);
Location loc = gpuPrintfOp->getLoc();
mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
- mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8, addressSpace);
+ mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8, addressSpace);
// Note: this is the GPUModule op, not the ModuleOp that surrounds it
// This ensures that global constants and declarations are placed within
}
// Get a pointer to the format string's first element
- Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
+ Value globalPtr = rewriter.create<LLVM::AddressOfOp>(
+ loc,
+ getTypeConverter()->getPointerType(globalType, global.getAddrSpace()),
+ global.getSymNameAttr());
Value stringStart = rewriter.create<LLVM::GEPOp>(
- loc, i8Ptr, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
+ loc, i8Ptr, globalType, globalPtr, ArrayRef<LLVM::GEPArg>{0, 0});
// Construct arguments and function call
auto argsRange = adaptor.getArgs();
Type llvmVoidType = LLVM::LLVMVoidType::get(context);
LLVM::LLVMPointerType llvmPointerType =
- LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
- Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType);
+ this->getTypeConverter()->getPointerType(IntegerType::get(context, 8));
+ Type llvmPointerPointerType =
+ this->getTypeConverter()->getPointerType(llvmPointerType);
Type llvmInt8Type = IntegerType::get(context, 8);
Type llvmInt32Type = IntegerType::get(context, 32);
Type llvmInt64Type = IntegerType::get(context, 64);
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
- LLVMTypeConverter converter(&getContext());
+ LowerToLLVMOptions options(&getContext());
+ options.useOpaquePointers = useOpaquePointers;
+
+ LLVMTypeConverter converter(&getContext(), options);
RewritePatternSet patterns(&getContext());
LLVMConversionTarget target(getContext());
auto stream = adaptor.getAsyncDependencies().front();
Value allocatedPtr =
allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult();
- allocatedPtr =
- rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
+ if (!getTypeConverter()->useOpaquePointers())
+ allocatedPtr =
+ rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
// No alignment.
Value alignedPtr = allocatedPtr;
Value pointer =
MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
- auto casted = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
+ if (!getTypeConverter()->useOpaquePointers())
+ pointer = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pointer);
Value stream = adaptor.getAsyncDependencies().front();
- deallocCallBuilder.create(loc, rewriter, {casted, stream});
+ deallocCallBuilder.create(loc, rewriter, {pointer, stream});
rewriter.replaceOp(deallocOp, {stream});
return success();
argumentTypes);
auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, 1);
auto structPtr = builder.create<LLVM::AllocaOp>(
- loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0);
+ loc, getTypeConverter()->getPointerType(structType), structType, one,
+ /*alignment=*/0);
auto arraySize =
builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, numArguments);
- auto arrayPtr = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
- arraySize, /*alignment=*/0);
+ auto arrayPtr = builder.create<LLVM::AllocaOp>(
+ loc, llvmPointerPointerType, llvmPointerType, arraySize, /*alignment=*/0);
for (const auto &en : llvm::enumerate(arguments)) {
- auto fieldPtr = builder.create<LLVM::GEPOp>(
- loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr,
+ Value fieldPtr = builder.create<LLVM::GEPOp>(
+ loc, getTypeConverter()->getPointerType(argumentTypes[en.index()]),
+ argumentTypes[en.index()], structPtr,
ArrayRef<LLVM::GEPArg>{0, en.index()});
builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
- auto elementPtr =
- builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, arrayPtr,
- ArrayRef<LLVM::GEPArg>{en.index()});
- auto casted =
- builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
- builder.create<LLVM::StoreOp>(loc, casted, elementPtr);
+ auto elementPtr = builder.create<LLVM::GEPOp>(
+ loc, llvmPointerPointerType, llvmPointerType, arrayPtr,
+ ArrayRef<LLVM::GEPArg>{en.index()});
+ if (!getTypeConverter()->useOpaquePointers())
+ fieldPtr =
+ builder.create<LLVM::BitcastOp>(loc, llvmPointerType, fieldPtr);
+ builder.create<LLVM::StoreOp>(loc, fieldPtr, elementPtr);
}
return arrayPtr;
}
std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
return LLVM::createGlobalString(
loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
- LLVM::Linkage::Internal);
+ LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
}
// Emits LLVM IR to launch a kernel function. Expects the module that contains
SmallString<128> nameBuffer(kernelModule.getName());
nameBuffer.append(kGpuBinaryStorageSuffix);
- Value data =
- LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
- binaryAttr.getValue(), LLVM::Linkage::Internal);
+ Value data = LLVM::createGlobalString(
+ loc, rewriter, nameBuffer.str(), binaryAttr.getValue(),
+ LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers());
auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
// Get the function from the module. The name corresponds to the name of
destinationType.getAddressSpace()),
sourcePtr);
+ if (typeConverter.useOpaquePointers())
+ return sourcePtr;
+
return rewriter.create<LLVM::BitcastOp>(loc, destinationType, sourcePtr);
}
Type elementPtrType = getElementPtrType(memRefType);
Value nullPtr = rewriter.create<LLVM::NullOp>(loc, elementPtrType);
- Value gepPtr =
- rewriter.create<LLVM::GEPOp>(loc, elementPtrType, nullPtr, numElements);
+ Value gepPtr = rewriter.create<LLVM::GEPOp>(
+ loc, elementPtrType,
+ typeConverter->convertType(memRefType.getElementType()), nullPtr,
+ numElements);
auto sizeBytes =
rewriter.create<LLVM::PtrToIntOp>(loc, getIndexType(), gepPtr);
RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,
bool kernelBarePtrCallConv) {
- converter.addConversion(
- [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type {
- return LLVM::LLVMPointerType::get(IntegerType::get(context, 8));
- });
+ converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type {
+ return converter.getPointerType(
+ IntegerType::get(&converter.getContext(), 8));
+ });
patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,
DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
+ options.useOpaquePointers = useOpaquePointers;
// Apply in-dialect lowering. In-dialect lowering will replace
// ops which need to be lowered further, which is not supported by a
ctx, DataLayout(cast<DataLayoutOpInterface>(m.getOperation())));
if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout)
options.overrideIndexBitwidth(indexBitwidth);
+ options.useOpaquePointers = useOpaquePointers;
if (useBarePtrCallConv) {
options.useBarePtrCallConv = true;
std::string entryPointGlobalName = (name + "_spv_entry_point_name").str();
return LLVM::createGlobalString(loc, builder, entryPointGlobalName,
- shaderName, LLVM::Linkage::Internal);
+ shaderName, LLVM::Linkage::Internal,
+ /*TODO:useOpaquePointers=*/false);
}
void VulkanLaunchFuncToVulkanCallsPass::translateVulkanLaunchCall(
// that data to runtime call.
Value ptrToSPIRVBinary = LLVM::createGlobalString(
loc, builder, kSPIRVBinary, spirvAttributes.first.getValue(),
- LLVM::Linkage::Internal);
+ LLVM::Linkage::Internal, /*TODO:useOpaquePointers=*/false);
// Create LLVM constant for the size of SPIR-V binary shader.
Value binarySize = builder.create<LLVM::ConstantOp>(
Value mlir::LLVM::createGlobalString(Location loc, OpBuilder &builder,
StringRef name, StringRef value,
- LLVM::Linkage linkage) {
+ LLVM::Linkage linkage,
+ bool useOpaquePointers) {
assert(builder.getInsertionBlock() &&
builder.getInsertionBlock()->getParentOp() &&
"expected builder to point to a block constrained in an op");
loc, type, /*isConstant=*/true, linkage, name,
builder.getStringAttr(value), /*alignment=*/0);
+ LLVMPointerType resultType;
+ LLVMPointerType charPtr;
+ if (!useOpaquePointers) {
+ resultType = LLVMPointerType::get(type);
+ charPtr = LLVMPointerType::get(IntegerType::get(ctx, 8));
+ } else {
+ resultType = charPtr = LLVMPointerType::get(ctx);
+ }
+
// Get the pointer to the first character in the global string.
- Value globalPtr = builder.create<LLVM::AddressOfOp>(loc, global);
- return builder.create<LLVM::GEPOp>(
- loc, LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)), globalPtr,
- ArrayRef<GEPArg>{0, 0});
+ Value globalPtr = builder.create<LLVM::AddressOfOp>(loc, resultType,
+ global.getSymNameAttr());
+ return builder.create<LLVM::GEPOp>(loc, charPtr, type, globalPtr,
+ ArrayRef<GEPArg>{0, 0});
}
bool mlir::LLVM::satisfiesLLVMModule(Operation *op) {
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
module attributes {gpu.container_module} {
// CHECK-LABEL: llvm.func @main
// CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
%1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
// CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
- // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
- // CHECK: llvm.call @mgpuMemFree(%[[void_ptr]], %[[stream]])
+ // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]])
%3 = gpu.dealloc async [%2] %1 : memref<?xf32>
// CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
// CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" | FileCheck %s
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL
module attributes {gpu.container_module} {
// CHECK-DAG: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
// CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
// CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
- // CHECK-SAME: -> !llvm.ptr<i8>
+ // CHECK-SAME: -> !llvm.ptr
// CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]])
// CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
// CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
// CHECK: [[NUM_PARAMS:%.*]] = llvm.mlir.constant(6 : i32) : i32
- // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr<i8>
+ // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr
- // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr<ptr<i8>>
+ // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr
// CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
// CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
module attributes {gpu.container_module} {
%t0 = gpu.wait async
// CHECK: %[[size_bytes:.*]] = llvm.ptrtoint
// CHECK-NOT: llvm.addrspacecast
- // CHECK: %[[src:.*]] = llvm.bitcast
// CHECK: %[[addr_cast:.*]] = llvm.addrspacecast
- // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]]
- // CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]])
+ // CHECK: llvm.call @mgpuMemcpy(%[[addr_cast]], %{{.*}}, %[[size_bytes]], %[[t0]])
%t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32>
// CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
// CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
--- /dev/null
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-opaque-pointers=0' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-opaque-pointers=0' | FileCheck %s --check-prefixes=CHECK,NVVM
+
+gpu.module @kernel {
+ gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space<private>>) {
+ %c0 = arith.constant 0 : index
+ memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space<private>>
+ gpu.return
+ }
+}
+
+// CHECK-LABEL: llvm.func @private
+// CHECK: llvm.store
+// ROCDL-SAME: : !llvm.ptr<f32, 5>
+// NVVM-SAME: : !llvm.ptr<f32>
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL
-// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-opaque-pointers=1' | FileCheck %s --check-prefixes=CHECK,ROCDL
+// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-opaque-pointers=1' | FileCheck %s --check-prefixes=CHECK,NVVM
gpu.module @kernel {
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space<private>>) {
// CHECK-LABEL: llvm.func @private
// CHECK: llvm.store
-// ROCDL-SAME: : !llvm.ptr<f32, 5>
-// NVVM-SAME: : !llvm.ptr<f32>
+// ROCDL-SAME: : f32, !llvm.ptr<5>
+// NVVM-SAME: : f32, !llvm.ptr
// -----
// CHECK-LABEL: llvm.func @workgroup
// CHECK: llvm.store
-// CHECK-SAME: : !llvm.ptr<f32, 3>
+// CHECK-SAME: : f32, !llvm.ptr<3>
// -----
// CHECK-LABEL: llvm.func @nested_memref
// CHECK: llvm.load
-// CHECK-SAME: : !llvm.ptr<{{.*}}, 1>
+// CHECK-SAME: : !llvm.ptr<1>
// CHECK: [[value:%.+]] = llvm.load
-// CHECK-SAME: : !llvm.ptr<f32, 1>
+// CHECK-SAME: : !llvm.ptr<1> -> f32
// CHECK: llvm.return [[value]]
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
module attributes {gpu.container_module} {
// CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
%t0 = gpu.wait async
// CHECK: %[[size_bytes:.*]] = llvm.mlir.constant
- // CHECK: %[[value:.*]] = llvm.bitcast
// CHECK: %[[addr_cast:.*]] = llvm.addrspacecast
- // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]]
- // CHECK: llvm.call @mgpuMemset32(%[[dst]], %[[value]], %[[size_bytes]], %[[t0]])
+ // CHECK: llvm.call @mgpuMemset32(%[[addr_cast]], %{{.*}}, %[[size_bytes]], %[[t0]])
%t1 = gpu.memset async [%t0] %dst, %value : memref<7xf32, 1>, f32
// CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
// CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
module attributes {gpu.container_module} {
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
-// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm='use-opaque-pointers=1' --split-input-file %s | FileCheck --check-prefix=NVVM %s
+// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='use-opaque-pointers=1' --split-input-file %s | FileCheck --check-prefix=ROCDL %s
gpu.module @kernel {
// NVVM-LABEL: llvm.func @private
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space<private>>) {
// Allocate private memory inside the function.
// NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
- // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<f32>
+ // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr
// ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64
- // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<f32, 5>
+ // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<5>
// Populate the memref descriptor.
- // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<1 x i64>, array<1 x i64>)>
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
- // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32, 5>, ptr<f32, 5>, i64, array<1 x i64>, array<1 x i64>)>
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// ROCDL-SAME: {
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, #gpu.address_space<workgroup>>) {
// Get the address of the first element in the global array.
- // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<array<4 x f32>, 3>
+ // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
- // NVVM-SAME: !llvm.ptr<f32, 3>
+ // NVVM-SAME: !llvm.ptr<3>
- // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<array<4 x f32>, 3>
+ // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
- // ROCDL-SAME: !llvm.ptr<f32, 3>
+ // ROCDL-SAME: !llvm.ptr<3>
// Populate the memref descriptor.
- // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32, 3>, ptr<f32, 3>, i64, array<1 x i64>, array<1 x i64>)>
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
- // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32, 3>, ptr<f32, 3>, i64, array<1 x i64>, array<1 x i64>)>
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// ROCDL-LABEL: llvm.func @workgroup3d
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, #gpu.address_space<workgroup>>) {
// Get the address of the first element in the global array.
- // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<array<48 x f32>, 3>
+ // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
- // NVVM-SAME: !llvm.ptr<f32, 3>
+ // NVVM-SAME: !llvm.ptr<3>
- // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<array<48 x f32>, 3>
+ // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3>
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0]
- // ROCDL-SAME: !llvm.ptr<f32, 3>
+ // ROCDL-SAME: !llvm.ptr<3>
// Populate the memref descriptor.
- // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32, 3>, ptr<f32, 3>, i64, array<3 x i64>, array<3 x i64>)>
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64
// NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
- // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<f32, 3>, ptr<f32, 3>, i64, array<3 x i64>, array<3 x i64>)>
+ // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64
// Private buffers.
// NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
- // NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<f32>
+ // NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
- // NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<f32>
+ // NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr
// ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
- // ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<f32, 5>
+ // ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<5>
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
- // ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<f32, 5>
+ // ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<5>
%c0 = arith.constant 0 : index
memref.store %arg0, %arg1[%c0] : memref<1xf32, #gpu.address_space<workgroup>>
-// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s\r
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s\r
\r
func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) {\r
%c0 = arith.constant 0 : index\r
--- /dev/null
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=0' --split-input-file | FileCheck %s
+
+module attributes {gpu.container_module} {
+ // CHECK-LABEL: llvm.func @main
+ // CHECK-SAME: %[[size:.*]]: i64
+ func.func @main(%size : index) {
+ // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate()
+ %0 = gpu.wait async
+ // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]]
+ // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]]
+ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]])
+ %1, %2 = gpu.alloc async [%0] (%size) : memref<?xf32>
+ // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0]
+ // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]]
+ // CHECK: llvm.call @mgpuMemFree(%[[void_ptr]], %[[stream]])
+ %3 = gpu.dealloc async [%2] %1 : memref<?xf32>
+ // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]])
+ // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]])
+ gpu.wait [%3]
+ return
+ }
+
+ // CHECK: func @foo
+ func.func @foo(%dst : memref<7xf32, 1>, %src : memref<7xf32>) {
+ // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
+ %t0 = gpu.wait async
+ // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint
+ // CHECK-NOT: llvm.addrspacecast
+ // CHECK: %[[src:.*]] = llvm.bitcast
+ // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast
+ // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]]
+ // CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]])
+ %t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32>
+ // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
+ // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+ gpu.wait [%t1]
+ return
+ }
+}
+
+// -----
+
+module attributes {gpu.container_module} {
+
+ // CHECK: func @foo
+ func.func @foo(%dst : memref<7xf32, 1>, %value : f32) {
+ // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate
+ %t0 = gpu.wait async
+ // CHECK: %[[size_bytes:.*]] = llvm.mlir.constant
+ // CHECK: %[[value:.*]] = llvm.bitcast
+ // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast
+ // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]]
+ // CHECK: llvm.call @mgpuMemset32(%[[dst]], %[[value]], %[[size_bytes]], %[[t0]])
+ %t1 = gpu.memset async [%t0] %dst, %value : memref<7xf32, 1>, f32
+ // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]])
+ // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]])
+ gpu.wait [%t1]
+ return
+ }
+}
+
// CHECK: gpu.module @foo attributes {gpu.binary = "CUBIN"}
gpu.module @foo {
- llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr<f32>)
+ llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr)
// CHECK: attributes {gpu.kernel}
attributes { gpu.kernel } {
llvm.return
-// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-opaque-pointers=1' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck --check-prefix=CHECK32 %s
gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
--- /dev/null
+// RUN: mlir-opt --convert-gpu-to-nvvm="use-opaque-pointers=0" --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32 use-opaque-pointers=0" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s
+
+gpu.module @test_module {
+
+ // CHECK-LABEL: func @gpu_wmma_load_op() ->
+ // CHECK-SAME: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ // CHECK32-LABEL: func @gpu_wmma_load_op() ->
+ func.func @gpu_wmma_load_op() -> (!gpu.mma_matrix<16x16xf16, "AOp">) {
+ %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3>
+ %i = arith.constant 16 : index
+ %j = arith.constant 16 : index
+ %0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
+ // CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64
+ // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
+ // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i64, array<2 x i64>, array<2 x i64>)>
+ // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64
+ // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64
+ // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64
+ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i64) -> !llvm.ptr<f16, 3>
+ // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
+ // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
+ // CHECK-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+
+ // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32
+ // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
+ // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i32, array<2 x i32>, array<2 x i32>)>
+ // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32
+ // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32
+ // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32
+ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i32) -> !llvm.ptr<f16, 3>
+ // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
+ // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
+ // CHECK32-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ return %0 : !gpu.mma_matrix<16x16xf16, "AOp">
+ }
+}
-// RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
-// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt --convert-gpu-to-nvvm='use-opaque-pointers=1' --split-input-file %s | FileCheck %s
+// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32 use-opaque-pointers=1" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s
gpu.module @test_module {
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
// CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64
// CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
- // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i64, array<2 x i64>, array<2 x i64>)>
+ // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64
// CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64
// CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64
- // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i64) -> !llvm.ptr<f16, 3>
+ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16
// CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
- // CHECK-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ // CHECK-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32
// CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
- // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i32, array<2 x i32>, array<2 x i32>)>
+ // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)>
// CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32
// CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32
- // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i32) -> !llvm.ptr<f16, 3>
+ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16
// CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
- // CHECK32-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16, 3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+ // CHECK32-SAME: {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
return %0 : !gpu.mma_matrix<16x16xf16, "AOp">
}
%0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xi8, 3> -> !gpu.mma_matrix<16x16xsi8, "AOp">
// CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64
// CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
- // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<i8, 3>, ptr<i8, 3>, i64, array<2 x i64>, array<2 x i64>)>
+ // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64
// CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64
// CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64
- // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<i8, 3>, i64) -> !llvm.ptr<i8, 3>
+ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i8
// CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
- // CHECK-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<i8, 3>) -> !llvm.struct<(i32, i32)>
+ // CHECK-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
// CHECK: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)>
// CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32
// CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}]
- // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<i8, 3>, ptr<i8, 3>, i32, array<2 x i32>, array<2 x i32>)>
+ // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)>
// CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32
// CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32
- // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<i8, 3>, i32) -> !llvm.ptr<i8, 3>
+ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8
// CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]]
- // CHECK32-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<i8, 3>) -> !llvm.struct<(i32, i32)>
+ // CHECK32-SAME: {eltype = #nvvm.mma_type<s8>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)>
// CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)>
return %0 : !gpu.mma_matrix<16x16xsi8, "AOp">
}
// CHECK: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
- // CHECK: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i64, array<2 x i64>, array<2 x i64>)>
+ // CHECK: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64
// CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64
// CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64
- // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i64) -> !llvm.ptr<f16, 3>
+ // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16
// CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]]
- // CHECK-SAME: {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<f16, 3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
+ // CHECK-SAME: {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
// CHECK: llvm.return
// CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32
// CHECK32: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK32: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK32: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
- // CHECK32: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<f16, 3>, ptr<f16, 3>, i32, array<2 x i32>, array<2 x i32>)>
+ // CHECK32: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)>
// CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32
// CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32
- // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<f16, 3>, i32) -> !llvm.ptr<f16, 3>
+ // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16
// CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32
// CHECK32: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]]
- // CHECK32-SAME: {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<f16, 3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
+ // CHECK32-SAME: {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<col>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
// CHECK32: llvm.return
return
}
gpu.module @test_module {
// CHECK-LABEL: func @gpu_wmma_mma_loop_op
-// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<c>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<c>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>)
// CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2
// CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3
// CHECK: ^bb2: // pred: ^bb1
-// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<b>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<f16>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<a>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
+// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type<f16>, frag = #nvvm.mma_frag<b>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[A0:.+]] = llvm.extractvalue %[[A]][0] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[A1:.+]] = llvm.extractvalue %[[A]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[A2:.+]] = llvm.extractvalue %[[A]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[E1:.+]] = llvm.extractvalue %[[ACC]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[E2:.+]] = llvm.extractvalue %[[ACC]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
// CHECK: %[[E3:.+]] = llvm.extractvalue %[[ACC]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>
-// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<f16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
+// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = #nvvm.mma_type<f16>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>
func.func @gpu_wmma_mma_loop_op(%arg0: memref<128x128xf16>, %arg1: memref<128x128xf16>, %arg2: memref<128x128xf16>) {
%c0 = arith.constant 0 : index
-// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=HIP -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP use-opaque-pointers=1' -split-input-file | FileCheck %s
gpu.module @test_module {
// CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
// CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00")
// CHECK-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
- // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr<i8>, i64, i32) -> i64
+ // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64
// CHECK-DAG: llvm.func @__ockl_printf_begin(i64) -> i64
// CHECK-LABEL: func @test_const_printf
gpu.func @test_const_printf() {
// CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
// CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64
- // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL0]] : !llvm.ptr<array<14 x i8>>
- // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr<array<14 x i8>>) -> !llvm.ptr<i8>
+ // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL0]] : !llvm.ptr
+ // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<14 x i8>
// CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(14 : i64) : i64
// CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISLAST]]) : (i64, !llvm.ptr<i8>, i64, i32) -> i64
+ // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64
gpu.printf "Hello, world\n"
gpu.return
}
gpu.func @test_printf(%arg0: i32) {
// CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
// CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64
- // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr<array<11 x i8>>
- // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr<array<11 x i8>>) -> !llvm.ptr<i8>
+ // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr
+ // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<11 x i8>
// CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64
// CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32
- // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr<i8>, i64, i32) -> i64
+ // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64
// CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
// CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
-// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=OpenCL | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL use-opaque-pointers=1' | FileCheck %s
gpu.module @test_module {
// CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32}
- // CHECK: llvm.func @printf(!llvm.ptr<i8, 4>, ...) -> i32
+ // CHECK: llvm.func @printf(!llvm.ptr<4>, ...) -> i32
// CHECK-LABEL: func @test_printf
// CHECK: (%[[ARG0:.*]]: i32)
gpu.func @test_printf(%arg0: i32) {
- // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<array<11 x i8>, 4>
- // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<array<11 x i8>, 4>) -> !llvm.ptr<i8, 4>
- // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr<i8, 4>, i32) -> i32
+ // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4>
+ // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8>
+ // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr<4>, i32) -> i32
gpu.printf "Hello: %d\n" %arg0 : i32
gpu.return
}
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='use-opaque-pointers=1' -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck --check-prefix=CHECK32 %s
gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
}
}
-// ----
+// -----
gpu.module @module {
// CHECK-LABEL: @spirv_exp
-// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl='use-opaque-pointers=1' -split-input-file | FileCheck %s
// RUN: mlir-opt %s \
-// RUN: -convert-gpu-to-rocdl=use-bare-ptr-memref-call-conv=true \
+// RUN: -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true use-opaque-pointers=1' \
// RUN: -split-input-file \
// RUN: | FileCheck %s --check-prefix=BARE
gpu.module @memref_conversions {
// CHECK: llvm.func @kern
- // CHECK-SAME: (%{{.*}}: !llvm.ptr<f32>, %{{.*}}: !llvm.ptr<f32>, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64)
+ // CHECK-SAME: (%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64)
// BARE: llvm.func @kern
- // BARE-SAME: (%{{.*}}: !llvm.ptr<f32>)
+ // BARE-SAME: (%{{.*}}: !llvm.ptr)
gpu.func @kern(%arg0: memref<8xf32>) kernel {
gpu.return
}
--- /dev/null
+// RUN: mlir-opt %s -convert-gpu-to-rocdl="runtime=HIP use-opaque-pointers=0" -split-input-file | FileCheck --check-prefixes=CHECK,HIP %s
+// RUN: mlir-opt %s -convert-gpu-to-rocdl="runtime=OpenCL use-opaque-pointers=0" | FileCheck --check-prefixes=CHECK,OCL %s
+
+gpu.module @test_module {
+ // HIP-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00")
+ // HIP-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
+ // HIP-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr<i8>, i64, i32) -> i64
+ // HIP-DAG: llvm.func @__ockl_printf_begin(i64) -> i64
+
+ // OCL: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32}
+ // OCL: llvm.func @printf(!llvm.ptr<i8, 4>, ...) -> i32
+ // CHECK-LABEL: func @test_printf
+ // CHECK: (%[[ARG0:.*]]: i32)
+ gpu.func @test_printf(%arg0: i32) {
+ // OCL: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<array<11 x i8>, 4>
+ // OCL-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<array<11 x i8>, 4>) -> !llvm.ptr<i8, 4>
+ // OCL-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr<i8, 4>, i32) -> i32
+
+ // HIP: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+ // HIP-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64
+ // HIP-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr<array<11 x i8>>
+ // HIP-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr<array<11 x i8>>) -> !llvm.ptr<i8>
+ // HIP-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64
+ // HIP-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32
+ // HIP-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32
+ // HIP-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr<i8>, i64, i32) -> i64
+ // HIP-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
+ // HIP-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
+ // HIP-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
+
+ gpu.printf "Hello: %d\n" %arg0 : i32
+ gpu.return
+ }
+}
// CHECK: gpu.module @foo attributes {gpu.binary = "HSACO"}
gpu.module @foo {
- llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr<f32>)
+ llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr)
// CHECK: attributes {gpu.kernel}
attributes { gpu.kernel } {
llvm.return