#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include <vector>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "aarch64-lower"
return true;
}
+/// Check if both Op1 and Op2 are shufflevectors extracting the same half
+/// (either the lower or the upper half) of their source vectors' elements.
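+/// For example, with <16 x i8> source vectors the following pair qualifies,
+/// since both masks extract the lower half:
+///   %s1 = shufflevector <16 x i8> %a, <16 x i8> undef,
+///           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+///   %s2 = shufflevector <16 x i8> %b, <16 x i8> undef,
+///           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>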
+static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
+ auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
+ auto *FullVT = cast<VectorType>(FullV->getType());
+ auto *HalfVT = cast<VectorType>(HalfV->getType());
+ return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+ };
+
+ auto extractHalf = [](Value *FullV, Value *HalfV) {
+ auto *FullVT = cast<VectorType>(FullV->getType());
+ auto *HalfVT = cast<VectorType>(HalfV->getType());
+ return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
+ };
+
+ Constant *M1, *M2;
+ Value *S1Op1, *S2Op1;
+ if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
+ !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+ return false;
+
+ // Check that the operands are half as wide as the result and we extract
+ // half of the elements of the input vectors.
+ if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
+ !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
+ return false;
+
+ // Check the mask extracts either the lower or upper half of vector
+ // elements.
+ int M1Start = -1;
+ int M2Start = -1;
+ int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+ if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
+ !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
+ M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
+ return false;
+
+ return true;
+}
+
+/// Check if Ext1 and Ext2 are extends (sext or zext) that double the bitwidth
+/// of the vector elements.
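+/// For example, both `zext <8 x i8> %v to <8 x i16>` and
+/// `sext <8 x i8> %v to <8 x i16>` double their 8-bit elements to 16 bits.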
+static bool areExtractExts(Value *Ext1, Value *Ext2) {
+ auto areExtDoubled = [](Instruction *Ext) {
+ return Ext->getType()->getScalarSizeInBits() ==
+ 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
+ };
+
+ if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
+ !match(Ext2, m_ZExtOrSExt(m_Value())) ||
+ !areExtDoubled(cast<Instruction>(Ext1)) ||
+ !areExtDoubled(cast<Instruction>(Ext2)))
+ return false;
+
+ return true;
+}
+
+/// Check if sinking \p I's operands to I's basic block is profitable, because
+/// the operands can be folded into a target instruction, e.g.
+/// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
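+/// For example, sinking the half-vector extracts that feed a call to
+/// llvm.aarch64.neon.umull next to the call allows instruction selection to
+/// fold them into umull/umull2. CodeGenPrepare uses this hook when deciding
+/// whether to duplicate and sink such free operands.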
+bool AArch64TargetLowering::shouldSinkOperands(
+ Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+ if (!I->getType()->isVectorTy())
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::aarch64_neon_umull:
+ if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
+ return false;
+ Ops.push_back(&II->getOperandUse(0));
+ Ops.push_back(&II->getOperandUse(1));
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ case Instruction::Add: {
+ if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
+ return false;
+
+ // If the exts' operands extract either the lower or upper elements, we
+ // can sink them too.
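+    // In that case the shuffles are added to Ops as well, so the whole
+    // extract + extend chain can be sunk next to the add/sub.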
+    auto *Ext1 = cast<Instruction>(I->getOperand(0));
+    auto *Ext2 = cast<Instruction>(I->getOperand(1));
+ if (areExtractShuffleVectors(Ext1, Ext2)) {
+ Ops.push_back(&Ext1->getOperandUse(0));
+ Ops.push_back(&Ext2->getOperandUse(0));
+ }
+
+ Ops.push_back(&I->getOperandUse(0));
+ Ops.push_back(&I->getOperandUse(1));
+
+ return true;
+ }
+ default:
+ return false;
+ }
+ return false;
+}
+
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
unsigned &RequiredAligment) const {
if (!LoadedType.isSimple() ||
--- /dev/null
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -codegenprepare -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown"
+
+define <8 x i16> @sink_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT: [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT: ret <8 x i16> [[RES_1]]
+; CHECK: if.else:
+; CHECK-NEXT: [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT: ret <8 x i16> [[RES_2]]
+;
+entry:
+ %za = zext <8 x i8> %a to <8 x i16>
+ br i1 %c, label %if.then, label %if.else
+
+if.then:
+ %zb.1 = zext <8 x i8> %b to <8 x i16>
+ %res.1 = add <8 x i16> %za, %zb.1
+ ret <8 x i16> %res.1
+
+if.else:
+ %zb.2 = zext <8 x i8> %b to <8 x i16>
+ %res.2 = sub <8 x i16> %za, %zb.2
+ ret <8 x i16> %res.2
+}
+
+define <8 x i16> @sink_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @sink_sext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT: ret <8 x i16> [[RES_1]]
+; CHECK: if.else:
+; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[A]] to <8 x i16>
+; CHECK-NEXT: [[RES_2:%.*]] = sub <8 x i16> [[TMP1]], [[ZB_2]]
+; CHECK-NEXT: ret <8 x i16> [[RES_2]]
+;
+entry:
+ %za = sext <8 x i8> %a to <8 x i16>
+ br i1 %c, label %if.then, label %if.else
+
+if.then:
+ %zb.1 = sext <8 x i8> %b to <8 x i16>
+ %res.1 = add <8 x i16> %za, %zb.1
+ ret <8 x i16> %res.1
+
+if.else:
+ %zb.2 = sext <8 x i8> %b to <8 x i16>
+ %res.2 = sub <8 x i16> %za, %zb.2
+ ret <8 x i16> %res.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_zext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_zext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT:    [[ZB_1:%.*]] = zext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[TMP0:%.*]] = zext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT:    [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT:    ret <8 x i16> [[RES_1]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[ZB_2:%.*]] = zext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[ZB_2]]
+;
+entry:
+  %za = zext <8 x i8> %a to <8 x i16>
+  br i1 %c, label %if.then, label %if.else
+
+if.then:
+  %zb.1 = zext <8 x i8> %b to <8 x i16>
+  %res.1 = add <8 x i16> %za, %zb.1
+  ret <8 x i16> %res.1
+
+if.else:
+  %zb.2 = zext <8 x i8> %b to <8 x i16>
+ ret <8 x i16> %zb.2
+}
+
+define <8 x i16> @do_not_sink_nonfree_sext(<8 x i8> %a, <8 x i8> %b, i1 %c) {
+; CHECK-LABEL: @do_not_sink_nonfree_sext(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[ZB_1:%.*]] = sext <8 x i8> [[B:%.*]] to <8 x i16>
+; CHECK-NEXT: [[TMP0:%.*]] = sext <8 x i8> [[A:%.*]] to <8 x i16>
+; CHECK-NEXT: [[RES_1:%.*]] = add <8 x i16> [[TMP0]], [[ZB_1]]
+; CHECK-NEXT: ret <8 x i16> [[RES_1]]
+; CHECK: if.else:
+; CHECK-NEXT: [[ZB_2:%.*]] = sext <8 x i8> [[B]] to <8 x i16>
+; CHECK-NEXT: ret <8 x i16> [[ZB_2]]
+;
+entry:
+ %za = sext <8 x i8> %a to <8 x i16>
+ br i1 %c, label %if.then, label %if.else
+
+if.then:
+ %zb.1 = sext <8 x i8> %b to <8 x i16>
+ %res.1 = add <8 x i16> %za, %zb.1
+ ret <8 x i16> %res.1
+
+if.else:
+ %zb.2 = sext <8 x i8> %b to <8 x i16>
+ ret <8 x i16> %zb.2
+}
+
+; The masks used are suitable for umull; sink the shufflevectors to their users.
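+; The masks <0,...,7> and <8,...,15> extract the lower and upper halves of the
+; <16 x i8> operands, so shouldSinkOperands accepts the shufflevectors.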
+define <8 x i16> @sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @sink_shufflevector_umull(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP0]], <8 x i8> [[S2]])
+; CHECK-NEXT: ret <8 x i16> [[VMULL0]]
+; CHECK: if.else:
+; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[TMP1]], <8 x i8> [[S4]])
+; CHECK-NEXT: ret <8 x i16> [[VMULL1]]
+;
+entry:
+ %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+ ret <8 x i16> %vmull0
+
+if.else:
+ %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+ ret <8 x i16> %vmull1
+}
+
+; Both exts and their shufflevector operands can be sunk.
+define <8 x i16> @sink_shufflevector_ext_subadd(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %z1 = zext <8 x i8> %s1 to <8 x i16>
+ %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %z3 = sext <8 x i8> %s3 to <8 x i16>
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %z2 = zext <8 x i8> %s2 to <8 x i16>
+ %res1 = add <8 x i16> %z1, %z2
+ ret <8 x i16> %res1
+
+if.else:
+ %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %z4 = sext <8 x i8> %s4 to <8 x i16>
+ %res2 = sub <8 x i16> %z3, %z4
+ ret <8 x i16> %res2
+}
+
+declare void @user1(<8 x i16>)
+
+; Both exts and their shufflevector operands can still be sunk, even though
+; %z3 has an additional user.
+define <8 x i16> @sink_shufflevector_ext_subadd_multiuse(<16 x i8> %a, <16 x i8> %b) {
+entry:
+ %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %z1 = zext <8 x i8> %s1 to <8 x i16>
+ %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %z3 = sext <8 x i8> %s3 to <8 x i16>
+ call void @user1(<8 x i16> %z3)
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %z2 = zext <8 x i8> %s2 to <8 x i16>
+ %res1 = add <8 x i16> %z1, %z2
+ ret <8 x i16> %res1
+
+if.else:
+ %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %z4 = sext <8 x i8> %s4 to <8 x i16>
+ %res2 = sub <8 x i16> %z3, %z4
+ ret <8 x i16> %res2
+}
+
+; The masks used are not suitable for umull; do not sink the shufflevectors.
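+; The first mask repeats element 1 and the second repeats element 10, so
+; neither is a contiguous half extract and shouldSinkOperands rejects them.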
+define <8 x i16> @no_sink_shufflevector_umull(<16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: @no_sink_shufflevector_umull(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S3:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK: if.then:
+; CHECK-NEXT: [[S2:%.*]] = shufflevector <16 x i8> [[B:%.*]], <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[VMULL0:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S1]], <8 x i8> [[S2]])
+; CHECK-NEXT: ret <8 x i16> [[VMULL0]]
+; CHECK: if.else:
+; CHECK-NEXT: [[S4:%.*]] = shufflevector <16 x i8> [[B]], <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 10, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[VMULL1:%.*]] = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> [[S3]], <8 x i8> [[S4]])
+; CHECK-NEXT: ret <8 x i16> [[VMULL1]]
+;
+entry:
+ %s1 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 5, i32 6, i32 7>
+ %s3 = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ br i1 undef, label %if.then, label %if.else
+
+if.then:
+ %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+ ret <8 x i16> %vmull0
+
+if.else:
+ %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 10, i32 12, i32 13, i32 14, i32 15>
+ %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+ ret <8 x i16> %vmull1
+}
+
+; Function Attrs: nounwind readnone
+declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>) #2