From e3515ba3816b9cabeca6a3b03b90902ebcfd3c65 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot
Date: Fri, 10 Feb 2023 17:03:30 +0000
Subject: [PATCH] Reapply "[AMDGPU] Modify adjustInliningThreshold to also
 consider the cost of passing function arguments through the stack"

Reapplies 142c28ffa1323e9a8d53200a22c80d5d778e0d0f (part of D140242), which
was reverted due to AMDGPU OpenMP test failures. This diff fixes those
failures by skipping most of `adjustInliningThresholdUsingCallee` for
indirect calls, where no callee function is available.

Reviewed By: arsenm, #amdgpu

Differential Revision: https://reviews.llvm.org/D143498
---
 .../Target/AMDGPU/AMDGPUTargetTransformInfo.cpp    |  56 ++++++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  33 +++-
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      |   3 +
 .../AMDGPU/amdgpu-inline-stack-argument-i64.ll     | 100 ++++++++++++
 .../Inline/AMDGPU/amdgpu-inline-stack-argument.ll  | 164 ++++++++++++++++++++
 .../amdgpu-inline-stack-array-ptr-argument.ll      | 118 ++++++++++++++
 .../amdgpu-inline-stack-indirect-call-argument.ll  |  21 +++
 .../AMDGPU/amdgpu-inline-stack-ptr-argument.ll     | 112 ++++++++++++++
 .../AMDGPU/amdgpu-inline-stack-struct-argument.ll  | 171 +++++++++++++++++++++
 .../amdgpu-inline-stack-vector-ptr-argument.ll     | 118 ++++++++++++++
 10 files changed, 889 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll
 create mode 100644 llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 87e292f..e2c647f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,8 +17,10 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
 #include "llvm/IR/PatternMatch.h"
@@ -1167,10 +1169,57 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
   return true;
 }
 
+static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
+                                                   const SITargetLowering *TLI,
+                                                   const GCNTTIImpl *TTIImpl) {
+  const int NrOfSGPRUntilSpill = 26;
+  const int NrOfVGPRUntilSpill = 32;
+
+  const DataLayout &DL = TTIImpl->getDataLayout();
+
+  unsigned adjustThreshold = 0;
+  int SGPRsInUse = 0;
+  int VGPRsInUse = 0;
+  for (const Use &A : CB->args()) {
+    SmallVector<EVT, 4> ValueVTs;
+    ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
+    for (auto ArgVT : ValueVTs) {
+      unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
+          CB->getContext(), CB->getCallingConv(), ArgVT);
+      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
+        SGPRsInUse += CCRegNum;
+      else
+        VGPRsInUse += CCRegNum;
+    }
+  }
+
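+  // For example, a call that passes 17 i64 arguments in VGPRs occupies
+  // 2 * 17 = 34 VGPRs, 2 more than NrOfVGPRUntilSpill; each register beyond
+  // that budget adds ArgStackCost * getInstrCost() to the threshold below.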
+  // The cost of passing function arguments through the stack:
+  // 1 instruction to put a function argument on the stack in the caller.
+  // 1 instruction to take a function argument from the stack in the callee.
+  // 1 instruction to explicitly take care of data dependencies in the
+  // callee function.
+  InstructionCost ArgStackCost(1);
+  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
+      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+  ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
+      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+
+  // The penalty cost is computed relative to the cost of instructions and does
+  // not model any storage costs.
+  adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
+                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+  adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
+                     *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+  return adjustThreshold;
+}
+
 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
+  unsigned adjustThreshold = 0;
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1192,9 +1241,10 @@ unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
       }
     }
   }
-  if (AllocaSize)
-    return ArgAllocaCost;
-  return 0;
+  adjustThreshold +=
+      adjustInliningThresholdUsingCallee(CB, TLI, this);
+  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
+  return adjustThreshold;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 2acd167..f066764 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2481,10 +2481,35 @@ bool isArgPassedInSGPR(const Argument *A) {
   case CallingConv::AMDGPU_PS:
   case CallingConv::AMDGPU_CS:
   case CallingConv::AMDGPU_Gfx:
-    // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
-    // Everything else is in VGPRs.
-    return F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::InReg) ||
-           F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::ByVal);
+    // For non-compute shaders, SGPR inputs are marked with either inreg or
+    // byval. Everything else is in VGPRs.
+    return A->hasAttribute(Attribute::InReg) ||
+           A->hasAttribute(Attribute::ByVal);
   default:
     // TODO: Should calls support inreg for SGPR inputs?
     return false;
   }
 }
+
+bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
+  // Arguments to compute shaders are never a source of divergence.
+  CallingConv::ID CC = CB->getCallingConv();
+  switch (CC) {
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return true;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_LS:
+  case CallingConv::AMDGPU_HS:
+  case CallingConv::AMDGPU_ES:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+  case CallingConv::AMDGPU_Gfx:
+    // For non-compute shaders, SGPR inputs are marked with either inreg or
+    // byval. Everything else is in VGPRs.
+ return CB->paramHasAttr(ArgNo, Attribute::InReg) || + CB->paramHasAttr(ArgNo, Attribute::ByVal); default: // TODO: Should calls support inreg for SGPR inputs? return false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 96d8cb3..ef332e1 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -12,6 +12,7 @@ #include "SIDefines.h" #include "llvm/ADT/FloatingPointMode.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Module.h" #include "llvm/Support/Alignment.h" #include @@ -1260,6 +1261,8 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi); bool isArgPassedInSGPR(const Argument *Arg); +bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo); + LLVM_READONLY bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST, int64_t EncodedOffset); diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll new file mode 100644 index 0000000..af21cd6 --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument-i64.ll @@ -0,0 +1,100 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i64 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i64 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i64 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i64 @inlining_call + +define noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0) { +entry: + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + %xor15 = xor i64 %xor14, 1 + %xor16 = xor i64 %xor15, 2 + ret i64 %xor16 +} + +define noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) { +entry: + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, 
%p0 + %xor15 = xor i64 %xor14, %q0 + %xor16 = xor i64 %xor15, 1 + %xor17 = xor i64 %xor16, 1 + ret i64 %xor17 +} + +; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. +define i64 @Caller(ptr noundef %in) { +entry: + %arrayidx = getelementptr inbounds i64, ptr %in, i64 0 + %a0 = load i64, ptr %arrayidx, align 4 + %arrayidx1 = getelementptr inbounds i64, ptr %in, i64 1 + %b0 = load i64, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i64, ptr %in, i64 2 + %c0 = load i64, ptr %arrayidx2, align 4 + %arrayidx3 = getelementptr inbounds i64, ptr %in, i64 3 + %d0 = load i64, ptr %arrayidx3, align 4 + %arrayidx4 = getelementptr inbounds i64, ptr %in, i64 4 + %e0 = load i64, ptr %arrayidx4, align 4 + %arrayidx5 = getelementptr inbounds i64, ptr %in, i64 5 + %f0 = load i64, ptr %arrayidx5, align 4 + %arrayidx6 = getelementptr inbounds i64, ptr %in, i64 6 + %g0 = load i64, ptr %arrayidx6, align 4 + %arrayidx7 = getelementptr inbounds i64, ptr %in, i64 7 + %h0 = load i64, ptr %arrayidx7, align 4 + %arrayidx8 = getelementptr inbounds i64, ptr %in, i64 8 + %i0 = load i64, ptr %arrayidx8, align 4 + %arrayidx9 = getelementptr inbounds i64, ptr %in, i64 9 + %j0 = load i64, ptr %arrayidx9, align 4 + %arrayidx10 = getelementptr inbounds i64, ptr %in, i64 10 + %k0 = load i64, ptr %arrayidx10, align 4 + %arrayidx11 = getelementptr inbounds i64, ptr %in, i64 11 + %l0 = load i64, ptr %arrayidx11, align 4 + %arrayidx12 = getelementptr inbounds i64, ptr %in, i64 12 + %m0 = load i64, ptr %arrayidx12, align 4 + %arrayidx13 = getelementptr inbounds i64, ptr %in, i64 13 + %n0 = load i64, ptr %arrayidx13, align 4 + %arrayidx14 = getelementptr inbounds i64, ptr %in, i64 14 + %o0 = load i64, ptr %arrayidx14, align 4 + %arrayidx15 = getelementptr inbounds i64, ptr %in, i64 15 + %p0 = load i64, ptr %arrayidx15, align 4 + %arrayidx16 = getelementptr inbounds i64, ptr %in, i64 16 + %q0 = load i64, ptr %arrayidx16, align 4 + %noinlinecall1 = call noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0) + %add = add i64 0, %noinlinecall1 + %noinlinecall2 = call noundef i64 @non_inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0) + %add2 = add i64 %add, %noinlinecall2 + %inlinecall1 = call noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) + %add3 = add i64 %add2, %inlinecall1 + %inlinecall2 = call noundef i64 @inlining_call(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) + %add4 = add i64 %add3, %inlinecall2 + ret i64 %add4 +} diff --git 
a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll new file mode 100644 index 0000000..3f5af81 --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-argument.ll @@ -0,0 +1,164 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i32 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i32 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i32 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i32 @inlining_call + +define noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1) { +entry: + %xor = xor i32 %a0, %b0 + %xor1 = xor i32 %xor, %c0 + %xor2 = xor i32 %xor1, %d0 + %xor3 = xor i32 %xor2, %e0 + %xor4 = xor i32 %xor3, %f0 + %xor5 = xor i32 %xor4, %g0 + %xor6 = xor i32 %xor5, %h0 + %xor7 = xor i32 %xor6, %i0 + %xor8 = xor i32 %xor7, %j0 + %xor9 = xor i32 %xor8, %k0 + %xor10 = xor i32 %xor9, %l0 + %xor11 = xor i32 %xor10, %m0 + %xor12 = xor i32 %xor11, %n0 + %xor13 = xor i32 %xor12, %o0 + %xor14 = xor i32 %xor13, %p0 + %xor15 = xor i32 %xor14, %q0 + %xor16 = xor i32 %xor15, %r0 + %xor17 = xor i32 %xor16, %s0 + %xor18 = xor i32 %xor17, %t0 + %xor19 = xor i32 %xor18, %u0 + %xor20 = xor i32 %xor19, %v0 + %xor21 = xor i32 %xor20, %w0 + %xor22 = xor i32 %xor21, %x0 + %xor23 = xor i32 %xor22, %y0 + %xor24 = xor i32 %xor23, %z0 + %xor25 = xor i32 %xor24, %a1 + %xor26 = xor i32 %xor25, %b1 + %xor27 = xor i32 %xor26, %c1 + %xor28 = xor i32 %xor27, %d1 + %xor29 = xor i32 %xor28, %e1 + %xor30 = xor i32 %xor29, %f1 + %xor31 = xor i32 %xor30, 1 + %xor32 = xor i32 %xor31, 2 + ret i32 %xor32 +} + +define noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1) { +entry: + %xor = xor i32 %a0, %b0 + %xor1 = xor i32 %xor, %c0 + %xor2 = xor i32 %xor1, %d0 + %xor3 = xor i32 %xor2, %e0 + %xor4 = xor i32 %xor3, %f0 + %xor5 = xor i32 %xor4, %g0 + %xor6 = xor i32 %xor5, %h0 + %xor7 = xor i32 %xor6, %i0 + %xor8 = xor i32 %xor7, %j0 + %xor9 = xor i32 %xor8, %k0 + %xor10 = xor i32 %xor9, %l0 + %xor11 = xor i32 %xor10, %m0 + 
%xor12 = xor i32 %xor11, %n0
+  %xor13 = xor i32 %xor12, %o0
+  %xor14 = xor i32 %xor13, %p0
+  %xor15 = xor i32 %xor14, %q0
+  %xor16 = xor i32 %xor15, %r0
+  %xor17 = xor i32 %xor16, %s0
+  %xor18 = xor i32 %xor17, %t0
+  %xor19 = xor i32 %xor18, %u0
+  %xor20 = xor i32 %xor19, %v0
+  %xor21 = xor i32 %xor20, %w0
+  %xor22 = xor i32 %xor21, %x0
+  %xor23 = xor i32 %xor22, %y0
+  %xor24 = xor i32 %xor23, %z0
+  %xor25 = xor i32 %xor24, %a1
+  %xor26 = xor i32 %xor25, %b1
+  %xor27 = xor i32 %xor26, %c1
+  %xor28 = xor i32 %xor27, %d1
+  %xor29 = xor i32 %xor28, %e1
+  %xor30 = xor i32 %xor29, %f1
+  %xor31 = xor i32 %xor30, %g1
+  %xor32 = xor i32 %xor31, 1
+  %xor33 = xor i32 %xor32, 2
+  ret i32 %xor33
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus.
+define i32 @Caller(ptr noundef %in) {
+entry:
+  %arrayidx = getelementptr inbounds i32, ptr %in, i64 0
+  %a0 = load i32, ptr %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, ptr %in, i64 1
+  %b0 = load i32, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds i32, ptr %in, i64 2
+  %c0 = load i32, ptr %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, ptr %in, i64 3
+  %d0 = load i32, ptr %arrayidx3, align 4
+  %arrayidx4 = getelementptr inbounds i32, ptr %in, i64 4
+  %e0 = load i32, ptr %arrayidx4, align 4
+  %arrayidx5 = getelementptr inbounds i32, ptr %in, i64 5
+  %f0 = load i32, ptr %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, ptr %in, i64 6
+  %g0 = load i32, ptr %arrayidx6, align 4
+  %arrayidx7 = getelementptr inbounds i32, ptr %in, i64 7
+  %h0 = load i32, ptr %arrayidx7, align 4
+  %arrayidx8 = getelementptr inbounds i32, ptr %in, i64 8
+  %i0 = load i32, ptr %arrayidx8, align 4
+  %arrayidx9 = getelementptr inbounds i32, ptr %in, i64 9
+  %j0 = load i32, ptr %arrayidx9, align 4
+  %arrayidx10 = getelementptr inbounds i32, ptr %in, i64 10
+  %k0 = load i32, ptr %arrayidx10, align 4
+  %arrayidx11 = getelementptr inbounds i32, ptr %in, i64 11
+  %l0 = load i32, ptr %arrayidx11, align 4
+  %arrayidx12 = getelementptr inbounds i32, ptr %in, i64 12
+  %m0 = load i32, ptr %arrayidx12, align 4
+  %arrayidx13 = getelementptr inbounds i32, ptr %in, i64 13
+  %n0 = load i32, ptr %arrayidx13, align 4
+  %arrayidx14 = getelementptr inbounds i32, ptr %in, i64 14
+  %o0 = load i32, ptr %arrayidx14, align 4
+  %arrayidx15 = getelementptr inbounds i32, ptr %in, i64 15
+  %p0 = load i32, ptr %arrayidx15, align 4
+  %arrayidx16 = getelementptr inbounds i32, ptr %in, i64 16
+  %q0 = load i32, ptr %arrayidx16, align 4
+  %arrayidx17 = getelementptr inbounds i32, ptr %in, i64 17
+  %r0 = load i32, ptr %arrayidx17, align 4
+  %arrayidx18 = getelementptr inbounds i32, ptr %in, i64 18
+  %s0 = load i32, ptr %arrayidx18, align 4
+  %arrayidx19 = getelementptr inbounds i32, ptr %in, i64 19
+  %t0 = load i32, ptr %arrayidx19, align 4
+  %arrayidx20 = getelementptr inbounds i32, ptr %in, i64 20
+  %u0 = load i32, ptr %arrayidx20, align 4
+  %arrayidx21 = getelementptr inbounds i32, ptr %in, i64 21
+  %v0 = load i32, ptr %arrayidx21, align 4
+  %arrayidx22 = getelementptr inbounds i32, ptr %in, i64 22
+  %w0 = load i32, ptr %arrayidx22, align 4
+  %arrayidx23 = getelementptr inbounds i32, ptr %in, i64 23
+  %x0 = load i32, ptr %arrayidx23, align 4
+  %arrayidx24 = getelementptr inbounds i32, ptr %in, i64 24
+  %y0 = load i32, ptr %arrayidx24, align 4
+  %arrayidx25 = getelementptr inbounds i32, ptr %in, i64 25
+  %z0 = load i32, ptr %arrayidx25, align 4
+  %arrayidx26 = getelementptr inbounds i32, ptr %in,
i64 26 + %a1 = load i32, ptr %arrayidx26, align 4 + %arrayidx27 = getelementptr inbounds i32, ptr %in, i64 27 + %b1 = load i32, ptr %arrayidx27, align 4 + %arrayidx28 = getelementptr inbounds i32, ptr %in, i64 28 + %c1 = load i32, ptr %arrayidx28, align 4 + %arrayidx29 = getelementptr inbounds i32, ptr %in, i64 29 + %d1 = load i32, ptr %arrayidx29, align 4 + %arrayidx30 = getelementptr inbounds i32, ptr %in, i64 30 + %e1 = load i32, ptr %arrayidx30, align 4 + %arrayidx31 = getelementptr inbounds i32, ptr %in, i64 31 + %f1 = load i32, ptr %arrayidx31, align 4 + %arrayidx32 = getelementptr inbounds i32, ptr %in, i64 32 + %g1 = load i32, ptr %arrayidx32, align 4 + %noinlinecall1 = call noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1) + %add = add i32 0, %noinlinecall1 + %noinlinecall2 = call noundef i32 @non_inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1) + %add2 = add i32 %add, %noinlinecall2 + %inlinecall1 = call noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1) + %add3 = add i32 %add2, %inlinecall1 + %inlinecall2 = call noundef i32 @inlining_call(i32 noundef %a0, i32 noundef %b0, i32 noundef %c0, i32 noundef %d0, i32 noundef %e0, i32 noundef %f0, i32 noundef %g0, i32 noundef %h0, i32 noundef %i0, i32 noundef %j0, i32 noundef %k0, i32 noundef %l0, i32 noundef %m0, i32 noundef %n0, i32 noundef %o0, i32 noundef %p0, i32 noundef %q0, i32 noundef %r0, i32 noundef %s0, i32 noundef %t0, i32 noundef %u0, i32 noundef %v0, i32 noundef %w0, i32 noundef %x0, i32 noundef %y0, i32 noundef %z0, i32 noundef %a1, i32 noundef %b1, i32 noundef %c1, i32 noundef %d1, i32 noundef %e1, i32 noundef %f1, i32 noundef %g1) + %add4 = add i32 %add3, %inlinecall2 + ret i32 %add4 +} diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll new file mode 100644 index 0000000..61a8ab6 --- /dev/null +++ 
b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-array-ptr-argument.ll @@ -0,0 +1,118 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i64 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i64 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i64 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i64 @inlining_call + +define noundef i64 @non_inlining_call([2 x ptr] noundef %ptrarr, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) { +entry: + %ptra0 = extractvalue [2 x ptr] %ptrarr, 0 + %ptrb0 = extractvalue [2 x ptr] %ptrarr, 1 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + ret i64 %xor14 +} + +define noundef i64 @inlining_call([2 x ptr] noundef %ptrarr, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) { +entry: + %ptra0 = extractvalue [2 x ptr] %ptrarr, 0 + %ptrb0 = extractvalue [2 x ptr] %ptrarr, 1 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %q0 = load i64, ptr %ptrq0, align 8 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 
%xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + %xor15 = xor i64 %xor14, %q0 + ret i64 %xor15 +} + +; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. +define i64 @Caller(ptr noundef %in) { +entry: + %a0 = getelementptr inbounds i64, ptr %in, i64 0 + %b0 = getelementptr inbounds i64, ptr %in, i64 1 + %arr0 = insertvalue [2 x ptr] undef, ptr %a0, 0 + %arr1 = insertvalue [2 x ptr] %arr0, ptr %b0, 1 + %c0 = getelementptr inbounds i64, ptr %in, i64 2 + %d0 = getelementptr inbounds i64, ptr %in, i64 3 + %e0 = getelementptr inbounds i64, ptr %in, i64 4 + %f0 = getelementptr inbounds i64, ptr %in, i64 5 + %g0 = getelementptr inbounds i64, ptr %in, i64 6 + %h0 = getelementptr inbounds i64, ptr %in, i64 7 + %i0 = getelementptr inbounds i64, ptr %in, i64 8 + %j0 = getelementptr inbounds i64, ptr %in, i64 9 + %k0 = getelementptr inbounds i64, ptr %in, i64 10 + %l0 = getelementptr inbounds i64, ptr %in, i64 11 + %m0 = getelementptr inbounds i64, ptr %in, i64 12 + %n0 = getelementptr inbounds i64, ptr %in, i64 13 + %o0 = getelementptr inbounds i64, ptr %in, i64 14 + %p0 = getelementptr inbounds i64, ptr %in, i64 15 + %q0 = getelementptr inbounds i64, ptr %in, i64 16 + %noinlinecall1 = call noundef i64 @non_inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0) + %add = add i64 0, %noinlinecall1 + %noinlinecall2 = call noundef i64 @non_inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0) + %add2 = add i64 %add, %noinlinecall2 + %inlinecall1 = call noundef i64 @inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add3 = add i64 %add2, %inlinecall1 + %inlinecall2 = call noundef i64 @inlining_call([2 x ptr] noundef %arr1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add4 = add i64 %add3, %inlinecall2 + ret i64 %add4 +} diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll new file mode 100644 index 0000000..8a44f7d --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-indirect-call-argument.ll @@ -0,0 +1,21 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: Inlining (cost={{-+[0-9]+}}, 
threshold=330), Call: call void @Dummy + +define void @Wrapper(ptr nocapture nofree noundef readonly %func, i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) { +entry: + call void %func(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) + ret void +} + +define internal void @Dummy(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) { +entry: + ret void +} + +define void @Caller(i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) minsize { +entry: + call void @Wrapper(ptr noundef @Dummy, i64 noundef %a0, i64 noundef %b0, i64 noundef %c0, i64 noundef %d0, i64 noundef %e0, i64 noundef %f0, i64 noundef %g0, i64 noundef %h0, i64 noundef %i0, i64 noundef %j0, i64 noundef %k0, i64 noundef %l0, i64 noundef %m0, i64 noundef %n0, i64 noundef %o0, i64 noundef %p0, i64 noundef %q0) + ret void +} diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll new file mode 100644 index 0000000..e03fc1c --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-ptr-argument.ll @@ -0,0 +1,112 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i64 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i64 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i64 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i64 @inlining_call + +define noundef i64 @non_inlining_call(ptr noundef %ptra0, ptr noundef %ptrb0, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) { +entry: + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = 
load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + ret i64 %xor14 +} + +define noundef i64 @inlining_call(ptr noundef %ptra0, ptr noundef %ptrb0, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) { +entry: + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %q0 = load i64, ptr %ptrq0, align 8 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + %xor15 = xor i64 %xor14, %q0 + ret i64 %xor15 +} + +; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
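+; A sketch of the expected accounting (assuming 64-bit flat pointers, i.e.
+; 2 VGPRs per ptr argument): @non_inlining_call takes 16 ptr args = 32 VGPRs,
+; exactly the NrOfVGPRUntilSpill budget, so it gets no threshold bonus, while
+; @inlining_call takes 17 ptr args = 34 VGPRs, so the 2 excess registers
+; raise the threshold enough to inline at -inline-threshold=0.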
+define i64 @Caller(ptr noundef %in) { +entry: + %a0 = getelementptr inbounds i64, ptr %in, i64 0 + %b0 = getelementptr inbounds i64, ptr %in, i64 1 + %c0 = getelementptr inbounds i64, ptr %in, i64 2 + %d0 = getelementptr inbounds i64, ptr %in, i64 3 + %e0 = getelementptr inbounds i64, ptr %in, i64 4 + %f0 = getelementptr inbounds i64, ptr %in, i64 5 + %g0 = getelementptr inbounds i64, ptr %in, i64 6 + %h0 = getelementptr inbounds i64, ptr %in, i64 7 + %i0 = getelementptr inbounds i64, ptr %in, i64 8 + %j0 = getelementptr inbounds i64, ptr %in, i64 9 + %k0 = getelementptr inbounds i64, ptr %in, i64 10 + %l0 = getelementptr inbounds i64, ptr %in, i64 11 + %m0 = getelementptr inbounds i64, ptr %in, i64 12 + %n0 = getelementptr inbounds i64, ptr %in, i64 13 + %o0 = getelementptr inbounds i64, ptr %in, i64 14 + %p0 = getelementptr inbounds i64, ptr %in, i64 15 + %q0 = getelementptr inbounds i64, ptr %in, i64 16 + %noinlinecall1 = call noundef i64 @non_inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0) + %add = add i64 0, %noinlinecall1 + %noinlinecall2 = call noundef i64 @non_inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0) + %add2 = add i64 %add, %noinlinecall2 + %inlinecall1 = call noundef i64 @inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add3 = add i64 %add2, %inlinecall1 + %inlinecall2 = call noundef i64 @inlining_call(ptr noundef %a0, ptr noundef %b0, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add4 = add i64 %add3, %inlinecall2 + ret i64 %add4 +} diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll new file mode 100644 index 0000000..91c9ee1 --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-struct-argument.ll @@ -0,0 +1,171 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i64 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i64 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i64 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i64 @inlining_call + +%noinlineT = type {{ptr, ptr}, ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64} +%inlineT = 
type {{ptr, ptr}, ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64} + +define noundef i64 @non_inlining_call(%noinlineT noundef %struc) { +entry: + %ptra0 = extractvalue %noinlineT %struc, 0, 0 + %ptrb0 = extractvalue %noinlineT %struc, 0, 1 + %ptrc0 = extractvalue %noinlineT %struc, 1 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = extractvalue %noinlineT %struc, 2 + %e0 = extractvalue %noinlineT %struc, 3 + %f0 = extractvalue %noinlineT %struc, 4 + %g0 = extractvalue %noinlineT %struc, 5 + %h0 = extractvalue %noinlineT %struc, 6 + %i0 = extractvalue %noinlineT %struc, 7 + %j0 = extractvalue %noinlineT %struc, 8 + %k0 = extractvalue %noinlineT %struc, 9 + %l0 = extractvalue %noinlineT %struc, 10 + %m0 = extractvalue %noinlineT %struc, 11 + %n0 = extractvalue %noinlineT %struc, 12 + %o0 = extractvalue %noinlineT %struc, 13 + %p0 = extractvalue %noinlineT %struc, 14 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + ret i64 %xor14 +} + +define noundef i64 @inlining_call(%inlineT noundef %struc) { +entry: + %ptra0 = extractvalue %inlineT %struc, 0, 0 + %ptrb0 = extractvalue %inlineT %struc, 0, 1 + %ptrc0 = extractvalue %inlineT %struc, 1 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = extractvalue %inlineT %struc, 2 + %e0 = extractvalue %inlineT %struc, 3 + %f0 = extractvalue %inlineT %struc, 4 + %g0 = extractvalue %inlineT %struc, 5 + %h0 = extractvalue %inlineT %struc, 6 + %i0 = extractvalue %inlineT %struc, 7 + %j0 = extractvalue %inlineT %struc, 8 + %k0 = extractvalue %inlineT %struc, 9 + %l0 = extractvalue %inlineT %struc, 10 + %m0 = extractvalue %inlineT %struc, 11 + %n0 = extractvalue %inlineT %struc, 12 + %o0 = extractvalue %inlineT %struc, 13 + %p0 = extractvalue %inlineT %struc, 14 + %q0 = extractvalue %inlineT %struc, 15 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + %xor15 = xor i64 %xor14, %q0 + ret i64 %xor15 +} + +; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus. 
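+; A sketch of the expected accounting (assuming each ptr/i64 member occupies
+; 2 VGPRs): %noinlineT flattens to 16 scalar values = 32 VGPRs, exactly the
+; NrOfVGPRUntilSpill budget, so no threshold bonus, while %inlineT flattens
+; to 17 values = 34 VGPRs, so the excess raises the threshold enough to
+; inline at -inline-threshold=0.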
+define i64 @Caller(ptr noundef %in) { +entry: + %ptra0 = getelementptr inbounds i64, ptr %in, i64 0 + %ptrb0 = getelementptr inbounds i64, ptr %in, i64 1 + %ptrc0 = getelementptr inbounds i64, ptr %in, i64 2 + %ptrd0 = getelementptr inbounds i64, ptr %in, i64 3 + %ptre0 = getelementptr inbounds i64, ptr %in, i64 4 + %ptrf0 = getelementptr inbounds i64, ptr %in, i64 5 + %ptrg0 = getelementptr inbounds i64, ptr %in, i64 6 + %ptrh0 = getelementptr inbounds i64, ptr %in, i64 7 + %ptri0 = getelementptr inbounds i64, ptr %in, i64 8 + %ptrj0 = getelementptr inbounds i64, ptr %in, i64 9 + %ptrk0 = getelementptr inbounds i64, ptr %in, i64 10 + %ptrl0 = getelementptr inbounds i64, ptr %in, i64 11 + %ptrm0 = getelementptr inbounds i64, ptr %in, i64 12 + %ptrn0 = getelementptr inbounds i64, ptr %in, i64 13 + %ptro0 = getelementptr inbounds i64, ptr %in, i64 14 + %ptrp0 = getelementptr inbounds i64, ptr %in, i64 15 + %ptrq0 = getelementptr inbounds i64, ptr %in, i64 16 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %q0 = load i64, ptr %ptrq0, align 8 + %noinlinestruc1 = insertvalue %noinlineT undef, ptr %ptra0, 0, 0 + %noinlinestruc2 = insertvalue %noinlineT %noinlinestruc1, ptr %ptrb0, 0, 1 + %noinlinestruc3 = insertvalue %noinlineT %noinlinestruc2, ptr %ptrc0, 1 + %noinlinestruc4 = insertvalue %noinlineT %noinlinestruc3, i64 %d0, 2 + %noinlinestruc5 = insertvalue %noinlineT %noinlinestruc4, i64 %e0, 3 + %noinlinestruc6 = insertvalue %noinlineT %noinlinestruc5, i64 %f0, 4 + %noinlinestruc7 = insertvalue %noinlineT %noinlinestruc6, i64 %g0, 5 + %noinlinestruc8 = insertvalue %noinlineT %noinlinestruc7, i64 %h0, 6 + %noinlinestruc9 = insertvalue %noinlineT %noinlinestruc8, i64 %i0, 7 + %noinlinestruc10 = insertvalue %noinlineT %noinlinestruc9, i64 %j0, 8 + %noinlinestruc11 = insertvalue %noinlineT %noinlinestruc10, i64 %k0, 9 + %noinlinestruc12 = insertvalue %noinlineT %noinlinestruc11, i64 %l0, 10 + %noinlinestruc13 = insertvalue %noinlineT %noinlinestruc12, i64 %m0, 11 + %noinlinestruc14 = insertvalue %noinlineT %noinlinestruc13, i64 %n0, 12 + %noinlinestruc15 = insertvalue %noinlineT %noinlinestruc14, i64 %o0, 13 + %noinlinestruc16 = insertvalue %noinlineT %noinlinestruc15, i64 %p0, 14 + %inlinestruc1 = insertvalue %inlineT undef, ptr %ptra0, 0, 0 + %inlinestruc2 = insertvalue %inlineT %inlinestruc1, ptr %ptrb0, 0, 1 + %inlinestruc3 = insertvalue %inlineT %inlinestruc2, ptr %ptrc0, 1 + %inlinestruc4 = insertvalue %inlineT %inlinestruc3, i64 %d0, 2 + %inlinestruc5 = insertvalue %inlineT %inlinestruc4, i64 %e0, 3 + %inlinestruc6 = insertvalue %inlineT %inlinestruc5, i64 %f0, 4 + %inlinestruc7 = insertvalue %inlineT %inlinestruc6, i64 %g0, 5 + %inlinestruc8 = insertvalue %inlineT %inlinestruc7, i64 %h0, 6 + %inlinestruc9 = insertvalue %inlineT %inlinestruc8, i64 %i0, 7 + %inlinestruc10 = insertvalue %inlineT %inlinestruc9, i64 %j0, 8 + %inlinestruc11 = insertvalue %inlineT %inlinestruc10, i64 %k0, 9 + %inlinestruc12 = insertvalue %inlineT 
%inlinestruc11, i64 %l0, 10 + %inlinestruc13 = insertvalue %inlineT %inlinestruc12, i64 %m0, 11 + %inlinestruc14 = insertvalue %inlineT %inlinestruc13, i64 %n0, 12 + %inlinestruc15 = insertvalue %inlineT %inlinestruc14, i64 %o0, 13 + %inlinestruc16 = insertvalue %inlineT %inlinestruc15, i64 %p0, 14 + %inlinestruc17 = insertvalue %inlineT %inlinestruc16, i64 %q0, 15 + %noinlinecall1 = call noundef i64 @non_inlining_call(%noinlineT noundef %noinlinestruc16) + %add = add i64 0, %noinlinecall1 + %noinlinecall2 = call noundef i64 @non_inlining_call(%noinlineT noundef %noinlinestruc16) + %add2 = add i64 %add, %noinlinecall2 + %inlinecall1 = call noundef i64 @inlining_call(%inlineT noundef %inlinestruc17) + %add3 = add i64 %add2, %inlinecall1 + %inlinecall2 = call noundef i64 @inlining_call(%inlineT noundef %inlinestruc17) + %add4 = add i64 %add3, %inlinecall2 + ret i64 %add4 +} diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll new file mode 100644 index 0000000..bbb26a3 --- /dev/null +++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-stack-vector-ptr-argument.ll @@ -0,0 +1,118 @@ +; REQUIRES: asserts +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=inline -inline-cost-full=true -inline-threshold=0 -inline-instr-cost=5 -inline-call-penalty=0 -debug-only=inline < %s 2>&1 | FileCheck %s + +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall1 = call noundef i64 @non_inlining_call +; CHECK: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %noinlinecall2 = call noundef i64 @non_inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall1 = call noundef i64 @inlining_call +; CHECK-NOT: NOT Inlining (cost={{[0-9]+}}, threshold={{[0-9]+}}), Call: %inlinecall2 = call noundef i64 @inlining_call + +define noundef i64 @non_inlining_call(<2 x ptr> noundef %ptrvec, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0) { +entry: + %ptra0 = extractelement <2 x ptr> %ptrvec, i32 0 + %ptrb0 = extractelement <2 x ptr> %ptrvec, i32 1 + %a0 = load i64, ptr %ptra0, align 8 + %b0 = load i64, ptr %ptrb0, align 8 + %c0 = load i64, ptr %ptrc0, align 8 + %d0 = load i64, ptr %ptrd0, align 8 + %e0 = load i64, ptr %ptre0, align 8 + %f0 = load i64, ptr %ptrf0, align 8 + %g0 = load i64, ptr %ptrg0, align 8 + %h0 = load i64, ptr %ptrh0, align 8 + %i0 = load i64, ptr %ptri0, align 8 + %j0 = load i64, ptr %ptrj0, align 8 + %k0 = load i64, ptr %ptrk0, align 8 + %l0 = load i64, ptr %ptrl0, align 8 + %m0 = load i64, ptr %ptrm0, align 8 + %n0 = load i64, ptr %ptrn0, align 8 + %o0 = load i64, ptr %ptro0, align 8 + %p0 = load i64, ptr %ptrp0, align 8 + %xor = xor i64 %a0, %b0 + %xor1 = xor i64 %xor, %c0 + %xor2 = xor i64 %xor1, %d0 + %xor3 = xor i64 %xor2, %e0 + %xor4 = xor i64 %xor3, %f0 + %xor5 = xor i64 %xor4, %g0 + %xor6 = xor i64 %xor5, %h0 + %xor7 = xor i64 %xor6, %i0 + %xor8 = xor i64 %xor7, %j0 + %xor9 = xor i64 %xor8, %k0 + %xor10 = xor i64 %xor9, %l0 + %xor11 = xor i64 %xor10, %m0 + %xor12 = xor i64 %xor11, %n0 + %xor13 = xor i64 %xor12, %o0 + %xor14 = xor i64 %xor13, %p0 + ret i64 %xor14 +} + +define noundef i64 @inlining_call(<2 x ptr> noundef %ptrvec, ptr noundef %ptrc0, ptr noundef %ptrd0, ptr 
noundef %ptre0, ptr noundef %ptrf0, ptr noundef %ptrg0, ptr noundef %ptrh0, ptr noundef %ptri0, ptr noundef %ptrj0, ptr noundef %ptrk0, ptr noundef %ptrl0, ptr noundef %ptrm0, ptr noundef %ptrn0, ptr noundef %ptro0, ptr noundef %ptrp0, ptr noundef %ptrq0) {
+entry:
+  %ptra0 = extractelement <2 x ptr> %ptrvec, i32 0
+  %ptrb0 = extractelement <2 x ptr> %ptrvec, i32 1
+  %a0 = load i64, ptr %ptra0, align 8
+  %b0 = load i64, ptr %ptrb0, align 8
+  %c0 = load i64, ptr %ptrc0, align 8
+  %d0 = load i64, ptr %ptrd0, align 8
+  %e0 = load i64, ptr %ptre0, align 8
+  %f0 = load i64, ptr %ptrf0, align 8
+  %g0 = load i64, ptr %ptrg0, align 8
+  %h0 = load i64, ptr %ptrh0, align 8
+  %i0 = load i64, ptr %ptri0, align 8
+  %j0 = load i64, ptr %ptrj0, align 8
+  %k0 = load i64, ptr %ptrk0, align 8
+  %l0 = load i64, ptr %ptrl0, align 8
+  %m0 = load i64, ptr %ptrm0, align 8
+  %n0 = load i64, ptr %ptrn0, align 8
+  %o0 = load i64, ptr %ptro0, align 8
+  %p0 = load i64, ptr %ptrp0, align 8
+  %q0 = load i64, ptr %ptrq0, align 8
+  %xor = xor i64 %a0, %b0
+  %xor1 = xor i64 %xor, %c0
+  %xor2 = xor i64 %xor1, %d0
+  %xor3 = xor i64 %xor2, %e0
+  %xor4 = xor i64 %xor3, %f0
+  %xor5 = xor i64 %xor4, %g0
+  %xor6 = xor i64 %xor5, %h0
+  %xor7 = xor i64 %xor6, %i0
+  %xor8 = xor i64 %xor7, %j0
+  %xor9 = xor i64 %xor8, %k0
+  %xor10 = xor i64 %xor9, %l0
+  %xor11 = xor i64 %xor10, %m0
+  %xor12 = xor i64 %xor11, %n0
+  %xor13 = xor i64 %xor12, %o0
+  %xor14 = xor i64 %xor13, %p0
+  %xor15 = xor i64 %xor14, %q0
+  ret i64 %xor15
+}
+
+; Calling each (non-)inlining function twice to make sure they won't get the sole call inlining cost bonus.
+define i64 @Caller(ptr noundef %in) {
+entry:
+  %a0 = getelementptr inbounds i64, ptr %in, i64 0
+  %b0 = getelementptr inbounds i64, ptr %in, i64 1
+  %vec0 = insertelement <2 x ptr> undef, ptr %a0, i32 0
+  %vec1 = insertelement <2 x ptr> %vec0, ptr %b0, i32 1
+  %c0 = getelementptr inbounds i64, ptr %in, i64 2
+  %d0 = getelementptr inbounds i64, ptr %in, i64 3
+  %e0 = getelementptr inbounds i64, ptr %in, i64 4
+  %f0 = getelementptr inbounds i64, ptr %in, i64 5
+  %g0 = getelementptr inbounds i64, ptr %in, i64 6
+  %h0 = getelementptr inbounds i64, ptr %in, i64 7
+  %i0 = getelementptr inbounds i64, ptr %in, i64 8
+  %j0 = getelementptr inbounds i64, ptr %in, i64 9
+  %k0 = getelementptr inbounds i64, ptr %in, i64 10
+  %l0 = getelementptr inbounds i64, ptr %in, i64 11
+  %m0 = getelementptr inbounds i64, ptr %in, i64 12
+  %n0 = getelementptr inbounds i64, ptr %in, i64 13
+  %o0 = getelementptr inbounds i64, ptr %in, i64 14
+  %p0 = getelementptr inbounds i64, ptr %in, i64 15
+  %q0 = getelementptr inbounds i64, ptr %in, i64 16
+  %noinlinecall1 = call noundef i64 @non_inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add = add i64 0, %noinlinecall1
+  %noinlinecall2 = call noundef i64 @non_inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0)
+  %add2 = add i64 %add, %noinlinecall2
+  %inlinecall1 = call noundef i64 @inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0,
ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add3 = add i64 %add2, %inlinecall1 + %inlinecall2 = call noundef i64 @inlining_call(<2 x ptr> noundef %vec1, ptr noundef %c0, ptr noundef %d0, ptr noundef %e0, ptr noundef %f0, ptr noundef %g0, ptr noundef %h0, ptr noundef %i0, ptr noundef %j0, ptr noundef %k0, ptr noundef %l0, ptr noundef %m0, ptr noundef %n0, ptr noundef %o0, ptr noundef %p0, ptr noundef %q0) + %add4 = add i64 %add3, %inlinecall2 + ret i64 %add4 +} -- 2.7.4