From e4a9190ad7b20a12476e3cccc30e842c0a93f1bb Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 24 Apr 2020 15:46:31 -0700
Subject: [PATCH] [X86][ArgumentPromotion] Allow Argument Promotion when caller
 and callee disagree on 512-bit vector support, if the arguments are scalar.

If one of caller/callee has disabled ZMM registers due to
prefer-vector-width=256, we previously disabled argument promotion, as the
ABI might be incompatible: one side will split 512-bit vectors in that case.
But if we can see that the types are all scalar, this shouldn't be a
problem.

This patch assumes that the pointer element type reflects the type that the
argument will be promoted to.

Differential Revision: https://reviews.llvm.org/D78770
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 17 +++++--
 .../X86/min-legal-vector-width.ll              | 58 ++++++++++++++++++++++
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 79458eb..868e889 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3845,11 +3845,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
   // If we get here, we know the target features match. If one function
   // considers 512-bit vectors legal and the other does not, consider them
   // incompatible.
-  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
   const TargetMachine &TM = getTLI()->getTargetMachine();
 
-  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
-         TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+      TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+    return true;
+
+  // Consider the arguments compatible if they aren't vectors or aggregates.
+  // FIXME: Look at the size of vectors.
+  // FIXME: Look at the element types of aggregates to see if there are vectors.
+  // FIXME: The API of this function seems intended to allow arguments
+  // to be removed from the set, but the caller doesn't check if the set
+  // becomes empty so that may not work in practice.
+  return llvm::none_of(Args, [](Argument *A) {
+    auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+    return EltTy->isVectorTy() || EltTy->isAggregateType();
+  });
 }
 
 X86TTIImpl::TTI::MemCmpExpansionOptions
diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
index 6cca815..60aaf0d 100644
--- a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -318,6 +318,64 @@ bb:
   ret void
 }
 
+; If the arguments are scalar, it's ok to promote.
+define internal i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %X, i32* %Y) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = load i32, i32* %X
+  %B = load i32, i32* %Y
+  %C = add i32 %A, %B
+  ret i32 %C
+}
+
+define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %B) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CHECK-SAME: (i32* [[B:%.*]])
+; CHECK-NEXT:    [[A:%.*]] = alloca i32
+; CHECK-NEXT:    store i32 1, i32* [[A]]
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, i32* [[A]]
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, i32* [[B]]
+; CHECK-NEXT:    [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32 [[A_VAL]], i32 [[B_VAL]])
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = alloca i32
+  store i32 1, i32* %A
+  %C = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %A, i32* %B)
+  ret i32 %C
+}
+
+; If the arguments are scalar, it's ok to promote.
+define internal i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %X, i32* %Y) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = load i32, i32* %X
+  %B = load i32, i32* %Y
+  %C = add i32 %A, %B
+  ret i32 %C
+}
+
+define i32 @scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %B) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CHECK-SAME: (i32* [[B:%.*]])
+; CHECK-NEXT:    [[A:%.*]] = alloca i32
+; CHECK-NEXT:    store i32 1, i32* [[A]]
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, i32* [[A]]
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, i32* [[B]]
+; CHECK-NEXT:    [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32 [[A_VAL]], i32 [[B_VAL]])
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = alloca i32
+  store i32 1, i32* %A
+  %C = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %A, i32* %B)
+  ret i32 %C
+}
+
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5

-- 
2.7.4
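
Editor's note (not part of the patch): the #2 attribute group used by the new tests is defined
elsewhere in min-legal-vector-width.ll and does not appear in this hunk. The following is a
minimal, self-contained IR sketch of the caller/callee mismatch the change now tolerates; the
function names and attribute values below are illustrative assumptions chosen to mirror the
legal256/legal512 naming in the tests, not lines copied from the test file.

; Callee built as if 512-bit vectors are not legal
; ("min-legal-vector-width"="256", "prefer-vector-width"="256").
define internal i32 @sketch_callee(i32* %p) #0 {
  %v = load i32, i32* %p
  ret i32 %v
}

; Caller built as if 512-bit vectors are legal ("min-legal-vector-width"="512").
; The two functions may therefore disagree on useAVX512Regs(), but the only
; type flowing through the promoted argument is a scalar i32, so argument
; promotion is no longer blocked by the ABI-compatibility check.
define i32 @sketch_caller(i32* %p) #1 {
  %r = call i32 @sketch_callee(i32* %p)
  ret i32 %r
}

; Hypothetical attribute groups modeling the mismatch.
attributes #0 = { "target-features"="+avx512vl" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
attributes #1 = { "target-features"="+avx512vl" "min-legal-vector-width"="512" "prefer-vector-width"="256" }

With this change, running argument promotion over such a module may rewrite @sketch_callee to
take the loaded i32 value directly, analogous to what the CHECK lines in the added tests expect.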