From e4a9190ad7b20a12476e3cccc30e842c0a93f1bb Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 24 Apr 2020 15:46:31 -0700
Subject: [PATCH] [X86][ArgumentPromotion] Allow Argument Promotion when caller
 and callee disagree on 512-bit vector support, if the arguments are scalar.

If one of caller/callee has disabled ZMM registers due to
prefer-vector-width=256, we previously disabled argument promotion, as the
ABI might be incompatible: one side will split 512-bit vectors in that case.
But if we can see that the types are all scalar, this shouldn't be a
problem.

This patch assumes that the pointer element type reflects the type that the
argument will be promoted to.

Differential Revision: https://reviews.llvm.org/D78770
---
 llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 17 +++++--
 .../X86/min-legal-vector-width.ll              | 58 ++++++++++++++++++++++
 2 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 79458eb..868e889 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3845,11 +3845,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
   // If we get here, we know the target features match. If one function
   // considers 512-bit vectors legal and the other does not, consider them
   // incompatible.
-  // FIXME Look at the arguments and only consider 512 bit or larger vectors?
   const TargetMachine &TM = getTLI()->getTargetMachine();
 
-  return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
-         TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+  if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+      TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+    return true;
+
+  // Consider the arguments compatible if they aren't vectors or aggregates.
+  // FIXME: Look at the size of vectors.
+  // FIXME: Look at the element types of aggregates to see if there are vectors.
+  // FIXME: The API of this function seems intended to allow arguments
+  // to be removed from the set, but the caller doesn't check if the set
+  // becomes empty so that may not work in practice.
+  return llvm::none_of(Args, [](Argument *A) {
+    auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+    return EltTy->isVectorTy() || EltTy->isAggregateType();
+  });
 }
 
 X86TTIImpl::TTI::MemCmpExpansionOptions
diff --git a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
index 6cca815..60aaf0d 100644
--- a/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
+++ b/llvm/test/Transforms/ArgumentPromotion/X86/min-legal-vector-width.ll
@@ -318,6 +318,64 @@ bb:
   ret void
 }
 
+; If the arguments are scalar, it's ok to promote.
+define internal i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %X, i32* %Y) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = load i32, i32* %X
+  %B = load i32, i32* %Y
+  %C = add i32 %A, %B
+  ret i32 %C
+}
+
+define i32 @scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %B) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_avx512_legal256_prefer256_call_avx512_legal512_prefer256
+; CHECK-SAME: (i32* [[B:%.*]])
+; CHECK-NEXT:    [[A:%.*]] = alloca i32
+; CHECK-NEXT:    store i32 1, i32* [[A]]
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, i32* [[A]]
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, i32* [[B]]
+; CHECK-NEXT:    [[C:%.*]] = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32 [[A_VAL]], i32 [[B_VAL]])
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = alloca i32
+  store i32 1, i32* %A
+  %C = call i32 @scalar_callee_avx512_legal256_prefer256_call_avx512_legal512_prefer256(i32* %A, i32* %B)
+  ret i32 %C
+}
+
+; If the arguments are scalar, it's ok to promote.
+define internal i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %X, i32* %Y) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CHECK-SAME: (i32 [[X_VAL:%.*]], i32 [[Y_VAL:%.*]])
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X_VAL]], [[Y_VAL]]
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = load i32, i32* %X
+  %B = load i32, i32* %Y
+  %C = add i32 %A, %B
+  ret i32 %C
+}
+
+define i32 @scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %B) #2 {
+; CHECK-LABEL: define {{[^@]+}}@scalar_avx512_legal512_prefer256_call_avx512_legal256_prefer256
+; CHECK-SAME: (i32* [[B:%.*]])
+; CHECK-NEXT:    [[A:%.*]] = alloca i32
+; CHECK-NEXT:    store i32 1, i32* [[A]]
+; CHECK-NEXT:    [[A_VAL:%.*]] = load i32, i32* [[A]]
+; CHECK-NEXT:    [[B_VAL:%.*]] = load i32, i32* [[B]]
+; CHECK-NEXT:    [[C:%.*]] = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32 [[A_VAL]], i32 [[B_VAL]])
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %A = alloca i32
+  store i32 1, i32* %A
+  %C = call i32 @scalar_callee_avx512_legal512_prefer256_call_avx512_legal256_prefer256(i32* %A, i32* %B)
+  ret i32 %C
+}
+
 ; Function Attrs: argmemonly nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #5

-- 
2.7.4
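
Editor's note (not part of the patch): the #2 attribute group used by the new tests is defined
elsewhere in min-legal-vector-width.ll and does not appear in this hunk. The following is a
minimal, self-contained IR sketch of the caller/callee mismatch the change now tolerates; the
function names and attribute values below are illustrative assumptions chosen to mirror the
legal256/legal512 naming in the tests, not lines copied from the test file.

; Callee built as if 512-bit vectors are not legal
; ("min-legal-vector-width"="256", "prefer-vector-width"="256").
define internal i32 @sketch_callee(i32* %p) #0 {
  %v = load i32, i32* %p
  ret i32 %v
}

; Caller built as if 512-bit vectors are legal ("min-legal-vector-width"="512").
; The two functions may therefore disagree on useAVX512Regs(), but the only
; type flowing through the promoted argument is a scalar i32, so argument
; promotion is no longer blocked by the ABI-compatibility check.
define i32 @sketch_caller(i32* %p) #1 {
  %r = call i32 @sketch_callee(i32* %p)
  ret i32 %r
}

; Hypothetical attribute groups modeling the mismatch.
attributes #0 = { "target-features"="+avx512vl" "min-legal-vector-width"="256" "prefer-vector-width"="256" }
attributes #1 = { "target-features"="+avx512vl" "min-legal-vector-width"="512" "prefer-vector-width"="256" }

With this change, running argument promotion over such a module may rewrite @sketch_callee to
take the loaded i32 value directly, analogous to what the CHECK lines in the added tests expect.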