--- /dev/null
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -emit-llvm -fcuda-is-device \
+// RUN: -o - %s | FileCheck --enable-var-scope %s
+
+#define __device__ __attribute__((device))
+
+extern "C" __device__ int printf(const char *format, ...);
+
+__device__ int foo1() {
+ const char *s = "hello world";
+ return printf("%.*f %*.*s %p\n", 8, 3.14159, 8, 4, s, s);
+}
+
+// CHECK-LABEL: @_Z4foo1v()
+// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ]
+// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0)
+// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND1]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0)
+// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 4614256650576692846, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0)
+// CHECK: [[APPEND4:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND3]], i32 1, i64 8, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0)
+// CHECK: [[APPEND5:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND4]], i32 1, i64 4, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 0)
+// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ]
+// CHECK: [[APPEND6:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND5]], {{.*}}, i64 [[STRLEN2]], i32 0)
+// CHECK: [[PTR2INT:%.*]] = ptrtoint i8* %{{.*}} to i64
+// CHECK: [[APPEND7:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND6]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND7]] to i32
+// CHECK: ret i32 [[RETURN]]
+
+__device__ char *dstr;
+
+__device__ int foo2() {
+ return printf("%s %p\n", dstr, dstr);
+}
+
+// CHECK-LABEL: @_Z4foo2v()
+// CHECK: [[BEGIN:%.*]] = call i64 @__ockl_printf_begin(i64 0)
+// CHECK: [[STRLEN1:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ]
+// CHECK: [[APPEND1:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[BEGIN]], {{.*}}, i64 [[STRLEN1]], i32 0)
+// CHECK: [[STRLEN2:%.*]] = phi i64 [ %{{[^,]*}}, %{{[^ ]*}} ], [ 0, %{{[^ ]*}} ]
+// CHECK: [[APPEND2:%.*]] = call i64 @__ockl_printf_append_string_n(i64 [[APPEND1]], {{.*}}, i64 [[STRLEN2]], i32 0)
+// CHECK: [[PTR2INT:%.*]] = ptrtoint i8* %{{.*}} to i64
+// CHECK: [[APPEND3:%.*]] = call i64 @__ockl_printf_append_args(i64 [[APPEND2]], i32 1, i64 [[PTR2INT]], i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i32 1)
+// CHECK: [[RETURN:%.*]] = trunc i64 [[APPEND3]] to i32
+// CHECK: ret i32 [[RETURN]]
--- /dev/null
+//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function to lower a printf call into a series of device
+// library calls on the AMDGPU target.
+//
+// WARNING: This file knows about certain library functions. It recognizes them
+// by name, and hardwires knowledge of their semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include <iostream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-emit-printf"
+
+static bool isCString(const Value *Arg) {
+ auto Ty = Arg->getType();
+ auto PtrTy = dyn_cast<PointerType>(Ty);
+ if (!PtrTy)
+ return false;
+
+ auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
+ if (!IntTy)
+ return false;
+
+ return IntTy->getBitWidth() == 8;
+}
+
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Ty = Arg->getType();
+
+ if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getBitWidth()) {
+ case 32:
+ return Builder.CreateZExt(Arg, Int64Ty);
+ case 64:
+ return Arg;
+ }
+ }
+
+ if (Ty->getTypeID() == Type::DoubleTyID) {
+ return Builder.CreateBitCast(Arg, Int64Ty);
+ }
+
+ if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+ return Builder.CreatePtrToInt(Arg, Int64Ty);
+ }
+
+ llvm_unreachable("unexpected type");
+}
+
+static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
+ return Builder.CreateCall(Fn, Version);
+}
+
+static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
+ Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
+ Value *Arg4, Value *Arg5, Value *Arg6,
+ bool IsLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
+ Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
+ Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
+ auto IsLastValue = Builder.getInt32(IsLast);
+ auto NumArgsValue = Builder.getInt32(NumArgs);
+ return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
+ Arg4, Arg5, Arg6, IsLastValue});
+}
+
+static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ auto Zero = Builder.getInt64(0);
+ return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
+ Zero, IsLast);
+}
+
+// The device library does not provide strlen, so we build our own loop
+// here. While we are at it, we also include the terminating null in the length.
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
+ auto *Prev = Builder.GetInsertBlock();
+ Module *M = Prev->getModule();
+
+ auto CharZero = Builder.getInt8(0);
+ auto One = Builder.getInt64(1);
+ auto Zero = Builder.getInt64(0);
+ auto Int64Ty = Builder.getInt64Ty();
+
+ // The length is either zero for a null pointer, or the computed value for an
+ // actual string. We need a join block for a phi that represents the final
+ // value.
+ //
+ // Strictly speaking, the zero does not matter since
+ // __ockl_printf_append_string_n ignores the length if the pointer is null.
+ BasicBlock *Join = nullptr;
+ if (Prev->getTerminator()) {
+ Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
+ "strlen.join");
+ Prev->getTerminator()->eraseFromParent();
+ } else {
+ Join = BasicBlock::Create(M->getContext(), "strlen.join",
+ Prev->getParent());
+ }
+ BasicBlock *While =
+ BasicBlock::Create(M->getContext(), "strlen.while",
+ Prev->getParent(), Join);
+ BasicBlock *WhileDone = BasicBlock::Create(
+ M->getContext(), "strlen.while.done",
+ Prev->getParent(), Join);
+
+ // Emit an early return for when the pointer is null.
+ Builder.SetInsertPoint(Prev);
+ auto CmpNull =
+ Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
+ BranchInst::Create(Join, While, CmpNull, Prev);
+
+ // Entry to the while loop.
+ Builder.SetInsertPoint(While);
+
+ auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
+ PtrPhi->addIncoming(Str, Prev);
+ auto PtrNext = Builder.CreateGEP(PtrPhi, One);
+ PtrPhi->addIncoming(PtrNext, While);
+
+ // Condition for the while loop.
+ auto Data = Builder.CreateLoad(PtrPhi);
+ auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
+ Builder.CreateCondBr(Cmp, WhileDone, While);
+
+ // Add one to the computed length.
+ Builder.SetInsertPoint(WhileDone, WhileDone->begin());
+ auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
+ auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
+ auto Len = Builder.CreateSub(End, Begin);
+ Len = Builder.CreateAdd(Len, One);
+
+ // Final join.
+ BranchInst::Create(Join, WhileDone);
+ Builder.SetInsertPoint(Join, Join->begin());
+ auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
+ LenPhi->addIncoming(Len, WhileDone);
+ LenPhi->addIncoming(Zero, Prev);
+
+ return LenPhi;
+}
+
+static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
+ Value *Length, bool isLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto CharPtrTy = Builder.getInt8PtrTy();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
+ Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
+ auto IsLastInt32 = Builder.getInt32(isLast);
+ return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
+}
+
+static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Length = getStrlenWithNull(Builder, Arg);
+ return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
+}
+
+static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool SpecIsCString, bool IsLast) {
+ if (SpecIsCString && isCString(Arg)) {
+ return appendString(Builder, Desc, Arg, IsLast);
+ }
+ // If the format specifies a string but the argument is not, the frontend will
+ // have printed a warning. We just rely on undefined behaviour and send the
+ // argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast);
+}
+
+// Scan the format string to locate all specifiers, and mark the ones that
+// specify a string, i.e, the "%s" specifier with optional '*' characters.
+static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
+ StringRef Str;
+ if (!getConstantStringInfo(Fmt, Str) || Str.empty())
+ return;
+
+ static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
+ size_t SpecPos = 0;
+ // Skip the first argument, the format string.
+ unsigned ArgIdx = 1;
+
+ while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
+ if (Str[SpecPos + 1] == '%') {
+ SpecPos += 2;
+ continue;
+ }
+ auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
+ if (SpecEnd == StringRef::npos)
+ return;
+ auto Spec = Str.slice(SpecPos, SpecEnd + 1);
+ ArgIdx += Spec.count('*');
+ if (Str[SpecEnd] == 's') {
+ BV.set(ArgIdx);
+ }
+ SpecPos = SpecEnd + 1;
+ ++ArgIdx;
+ }
+}
+
+Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
+ ArrayRef<Value *> Args) {
+ auto NumOps = Args.size();
+ assert(NumOps >= 1);
+
+ auto Fmt = Args[0];
+ SparseBitVector<8> SpecIsCString;
+ locateCStrings(SpecIsCString, Fmt);
+
+ auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
+ Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
+
+ // FIXME: This invokes hostcall once for each argument. We can pack up to
+ // seven scalar printf arguments in a single hostcall. See the signature of
+ // callAppendArgs().
+ for (unsigned int i = 1; i != NumOps; ++i) {
+ bool IsLast = i == NumOps - 1;
+ bool IsCString = SpecIsCString.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ }
+
+ return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
+}