#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
static bool foldExtractCmp(Instruction &I, const TargetTransformInfo &TTI) {
// Match a cmp with extracted vector operands.
return true;
}
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+/// Pattern: bo (extelt X, C0), (extelt Y, C1) --> extelt (bo X, Y), C
+/// Only fires when TTI says the vector form is strictly cheaper.
+/// Returns true if the IR was changed.
+static bool foldExtractBinop(Instruction &I, const TargetTransformInfo &TTI) {
+  // It is not safe to transform things like div, urem, etc. because we may
+  // create undefined behavior when executing those on unknown vector elements.
+  if (!isSafeToSpeculativelyExecute(&I))
+    return false;
+
+  // Match a scalar binop with extracted vector operands:
+  // bo (extelt X, C0), (extelt Y, C1)
+  Instruction *Ext0, *Ext1;
+  if (!match(&I, m_BinOp(m_Instruction(Ext0), m_Instruction(Ext1))))
+    return false;
+
+  // Both operands must be extractelement with constant (known) indexes from
+  // vectors of the same type; otherwise we cannot form a single vector binop.
+  Value *X, *Y;
+  uint64_t C0, C1;
+  if (!match(Ext0, m_ExtractElement(m_Value(X), m_ConstantInt(C0))) ||
+      !match(Ext1, m_ExtractElement(m_Value(Y), m_ConstantInt(C1))) ||
+      X->getType() != Y->getType())
+    return false;
+
+  // Check if using a vector binop would be cheaper.
+  Instruction::BinaryOps BOpcode = cast<BinaryOperator>(I).getOpcode();
+  Type *ScalarTy = I.getType();
+  Type *VecTy = X->getType();
+  int ScalarBOCost = TTI.getArithmeticInstrCost(BOpcode, ScalarTy);
+  int VecBOCost = TTI.getArithmeticInstrCost(BOpcode, VecTy);
+  int Extract0Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C0);
+  int Extract1Cost = TTI.getVectorInstrCost(Instruction::ExtractElement,
+                                            VecTy, C1);
+
+  // Handle a special case - if the extract indexes are the same, the
+  // replacement sequence does not require a shuffle. Unless the vector binop is
+  // much more expensive than the scalar binop, this eliminates an extract.
+  // Extra uses of the extracts mean that we include those costs in the
+  // vector total because those instructions will not be eliminated.
+  if (C0 == C1) {
+    assert(Extract0Cost == Extract1Cost && "Different costs for same extract?");
+    int ExtractCost = Extract0Cost;
+    if (X != Y) {
+      // Scalar form pays for both extracts plus the scalar op. The vector form
+      // pays for the vector op, the new extract, and any original extract that
+      // still has other users (those will not be erased).
+      int ScalarCost = ExtractCost + ExtractCost + ScalarBOCost;
+      int VecCost = VecBOCost + ExtractCost +
+                    !Ext0->hasOneUse() * ExtractCost +
+                    !Ext1->hasOneUse() * ExtractCost;
+      if (ScalarCost <= VecCost)
+        return false;
+    } else {
+      // Handle an extra-special case. If the 2 binop operands are identical,
+      // adjust the formulas to account for that:
+      // bo (extelt X, C), (extelt X, C) --> extelt (bo X, X), C
+      // The extra use charge allows for either the CSE'd pattern or an
+      // unoptimized form with identical values.
+      bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+                                    : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+      int ScalarCost = ExtractCost + ScalarBOCost;
+      int VecCost = VecBOCost + ExtractCost + HasUseTax * ExtractCost;
+      if (ScalarCost <= VecCost)
+        return false;
+    }
+
+    // bo (extelt X, C), (extelt Y, C) --> extelt (bo X, Y), C
+    ++NumVecBO;
+    IRBuilder<> Builder(&I);
+    Value *NewBO = Builder.CreateBinOp(BOpcode, X, Y);
+    if (auto *VecBOInst = dyn_cast<Instruction>(NewBO)) {
+      // All IR flags are safe to back-propagate because any potential poison
+      // created in unused vector elements is discarded by the extract.
+      VecBOInst->copyIRFlags(&I);
+    }
+    // Reuse Ext0's index operand; C0 == C1 on this path, so either works.
+    // Dead extracts/binop are left behind for later cleanup passes.
+    Value *Extract = Builder.CreateExtractElement(NewBO, Ext0->getOperand(1));
+    I.replaceAllUsesWith(Extract);
+    return true;
+  }
+
+  // TODO: Handle C0 != C1 by shuffling 1 of the operands.
+  return false;
+}
+
/// This is the entry point for all transforms. Pass manager differences are
/// handled in the callers of this function.
static bool runImpl(Function &F, const TargetTransformInfo &TTI,
// iteratively in this loop rather than waiting until the end.
for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
MadeChange |= foldExtractCmp(I, TTI);
- // TODO: More transforms go here.
+ MadeChange |= foldExtractBinop(I, TTI);
}
}
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s
+; Eliminating extract is profitable.
+
define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_add(
-; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
-; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
-; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
-; CHECK-NEXT: ret i8 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
+; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 0
ret i8 %r
}
+; Eliminating extract is still profitable. Flags propagate.
+
define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_flags(
-; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
-; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = add nuw nsw i8 [[E0]], [[E1]]
-; CHECK-NEXT: ret i8 [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 1
ret i8 %r
}
+; Negative test - eliminating extract is profitable, but vector shift is expensive.
+
define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_shl(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
ret i8 %r
}
+; Negative test - eliminating extract is profitable, but vector multiply is expensive.
+
define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext13_ext13_mul(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
ret i8 %r
}
+; Negative test - cost is irrelevant because sdiv has potential UB.
+
define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_sdiv(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; Negative test - extracts are free and vector op has same cost as scalar.
+
define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext0_ext0_fadd(
; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 0
ret double %r
}
+; Eliminating extract is profitable. Flags propagate.
+
define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fsub(
-; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[E1:%.*]] = extractelement <2 x double> [[Y:%.*]], i32 1
-; CHECK-NEXT: [[R:%.*]] = fsub fast double [[E0]], [[E1]]
-; CHECK-NEXT: ret double [[R]]
+; CHECK-NEXT: [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
+; CHECK-NEXT: ret double [[TMP2]]
;
%e0 = extractelement <2 x double> %x, i32 1
%e1 = extractelement <2 x double> %y, i32 1
ret double %r
}
+; Negative test - type mismatch.
+
define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fadd_different_types(
; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
ret double %r
}
+; Negative test - disguised same vector operand; scalar code is cheaper than general case.
+
define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
ret i32 %r
}
+; Negative test - same vector operand; scalar code is cheaper than general case.
+
define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
; CHECK-NEXT: [[E0:%.*]] = extractelement <4 x i32> [[X:%.*]], i32 1
declare void @use_i8(i8)
+; Negative test - same vector operand; scalar code is cheaper than general case
+; and vector code would be more expensive still.
+
define i8 @ext1_ext1_add_same_vec_extra_use0(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use0(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; Negative test - same vector operand; scalar code is cheaper than general case
+; and vector code would be more expensive still.
+
define i8 @ext1_ext1_add_same_vec_extra_use1(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use1(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; Negative test - same vector operand; scalar code is cheaper than general case
+; and vector code would be more expensive still.
+
define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse_extra_use(
; CHECK-NEXT: [[E:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; Negative test - vector code would not be cheaper.
+
define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses1(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; Negative test - vector code would not be cheaper.
+
define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses2(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
ret i8 %r
}
+; TODO: Different extract indexes requires a shuffle.
+
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext1_add(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0