1 //===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
9 // This file implements the AArch64TargetLowering class.
11 //===----------------------------------------------------------------------===//
13 #include "AArch64ISelLowering.h"
14 #include "AArch64CallingConvention.h"
15 #include "AArch64ExpandImm.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64PerfectShuffle.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "MCTargetDesc/AArch64AddressingModes.h"
21 #include "Utils/AArch64BaseInfo.h"
22 #include "llvm/ADT/APFloat.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ArrayRef.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/SmallVector.h"
28 #include "llvm/ADT/Statistic.h"
29 #include "llvm/ADT/StringRef.h"
30 #include "llvm/ADT/Triple.h"
31 #include "llvm/ADT/Twine.h"
32 #include "llvm/Analysis/MemoryLocation.h"
33 #include "llvm/Analysis/ObjCARCUtil.h"
34 #include "llvm/Analysis/TargetTransformInfo.h"
35 #include "llvm/Analysis/VectorUtils.h"
36 #include "llvm/CodeGen/Analysis.h"
37 #include "llvm/CodeGen/CallingConvLower.h"
38 #include "llvm/CodeGen/ISDOpcodes.h"
39 #include "llvm/CodeGen/MachineBasicBlock.h"
40 #include "llvm/CodeGen/MachineFrameInfo.h"
41 #include "llvm/CodeGen/MachineFunction.h"
42 #include "llvm/CodeGen/MachineInstr.h"
43 #include "llvm/CodeGen/MachineInstrBuilder.h"
44 #include "llvm/CodeGen/MachineMemOperand.h"
45 #include "llvm/CodeGen/MachineRegisterInfo.h"
46 #include "llvm/CodeGen/RuntimeLibcalls.h"
47 #include "llvm/CodeGen/SelectionDAG.h"
48 #include "llvm/CodeGen/SelectionDAGNodes.h"
49 #include "llvm/CodeGen/TargetCallingConv.h"
50 #include "llvm/CodeGen/TargetInstrInfo.h"
51 #include "llvm/CodeGen/ValueTypes.h"
52 #include "llvm/IR/Attributes.h"
53 #include "llvm/IR/Constants.h"
54 #include "llvm/IR/DataLayout.h"
55 #include "llvm/IR/DebugLoc.h"
56 #include "llvm/IR/DerivedTypes.h"
57 #include "llvm/IR/Function.h"
58 #include "llvm/IR/GetElementPtrTypeIterator.h"
59 #include "llvm/IR/GlobalValue.h"
60 #include "llvm/IR/IRBuilder.h"
61 #include "llvm/IR/Instruction.h"
62 #include "llvm/IR/Instructions.h"
63 #include "llvm/IR/IntrinsicInst.h"
64 #include "llvm/IR/Intrinsics.h"
65 #include "llvm/IR/IntrinsicsAArch64.h"
66 #include "llvm/IR/Module.h"
67 #include "llvm/IR/OperandTraits.h"
68 #include "llvm/IR/PatternMatch.h"
69 #include "llvm/IR/Type.h"
70 #include "llvm/IR/Use.h"
71 #include "llvm/IR/Value.h"
72 #include "llvm/MC/MCRegisterInfo.h"
73 #include "llvm/Support/Casting.h"
74 #include "llvm/Support/CodeGen.h"
75 #include "llvm/Support/CommandLine.h"
76 #include "llvm/Support/Compiler.h"
77 #include "llvm/Support/Debug.h"
78 #include "llvm/Support/ErrorHandling.h"
79 #include "llvm/Support/InstructionCost.h"
80 #include "llvm/Support/KnownBits.h"
81 #include "llvm/Support/MachineValueType.h"
82 #include "llvm/Support/MathExtras.h"
83 #include "llvm/Support/raw_ostream.h"
84 #include "llvm/Target/TargetMachine.h"
85 #include "llvm/Target/TargetOptions.h"
99 using namespace llvm::PatternMatch;
101 #define DEBUG_TYPE "aarch64-lower"
103 STATISTIC(NumTailCalls, "Number of tail calls");
104 STATISTIC(NumShiftInserts, "Number of vector shift inserts");
105 STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
107 // FIXME: The necessary dtprel relocations don't seem to be supported
108 // well in the GNU bfd and gold linkers at the moment. Therefore, by
109 // default, for now, fall back to GeneralDynamic code generation.
110 cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
111 "aarch64-elf-ldtls-generation", cl::Hidden,
112 cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
116 EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
117 cl::desc("Enable AArch64 logical imm instruction "
121 // Temporary option added for the purpose of testing functionality added
122 // to DAGCombiner.cpp in D92230. It is expected that this can be removed
123 // in the future once both implementations are based on MGATHER rather
124 // than the GLD1 nodes added for the SVE gather load intrinsics.
126 EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
127 cl::desc("Combine extends of AArch64 masked "
128 "gather intrinsics"),
131 /// Value type used for condition codes.
132 static const MVT MVT_CC = MVT::i32;
134 static inline EVT getPackedSVEVectorVT(EVT VT) {
135 switch (VT.getSimpleVT().SimpleTy) {
137 llvm_unreachable("unexpected element type for vector");
153 return MVT::nxv8bf16;
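// A minimal usage sketch (illustrative values, following the packed SVE
// container sizes): getPackedSVEVectorVT(MVT::i8) yields MVT::nxv16i8,
// getPackedSVEVectorVT(MVT::f32) yields MVT::nxv4f32, and
// getPackedSVEVectorVT(MVT::f64) yields MVT::nxv2f64 -- i.e. the element type
// always fills the minimum 128-bit SVE register block.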
157 // NOTE: Currently there's only a need to return integer vector types. If this
158 // changes then just add an extra "type" parameter.
159 static inline EVT getPackedSVEVectorVT(ElementCount EC) {
160 switch (EC.getKnownMinValue()) {
162 llvm_unreachable("unexpected element count for vector");
174 static inline EVT getPromotedVTForPredicate(EVT VT) {
175 assert(VT.isScalableVector() && (VT.getVectorElementType() == MVT::i1) &&
176 "Expected scalable predicate vector type!");
177 switch (VT.getVectorMinNumElements()) {
179 llvm_unreachable("unexpected element count for vector");
191 /// Returns true if VT's elements occupy the lowest bit positions of its
192 /// associated register class without any intervening space.
194 /// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
195 /// same register class, but only nxv8f16 can be treated as a packed vector.
196 static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
197 assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
198 "Expected legal vector type!");
199 return VT.isFixedLengthVector() ||
200 VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
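// For example (based on the definitions above): nxv8f16 has a known minimum
// size of 8 x 16 = 128 bits == SVEBitsPerBlock, so it is packed, whereas
// nxv4f16 only guarantees 64 bits and therefore is not.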
203 // Returns true for ####_MERGE_PASSTHRU opcodes, whose operands have a leading
204 // predicate and end with a passthru value matching the result type.
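// For instance, a node such as FNEG_MERGE_PASSTHRU conceptually takes
// (Pg, Src, Passthru): lanes where Pg is active receive fneg(Src), and the
// remaining lanes are taken from Passthru.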
205 static bool isMergePassthruOpcode(unsigned Opc) {
209 case AArch64ISD::BITREVERSE_MERGE_PASSTHRU:
210 case AArch64ISD::BSWAP_MERGE_PASSTHRU:
211 case AArch64ISD::REVH_MERGE_PASSTHRU:
212 case AArch64ISD::REVW_MERGE_PASSTHRU:
213 case AArch64ISD::REVD_MERGE_PASSTHRU:
214 case AArch64ISD::CTLZ_MERGE_PASSTHRU:
215 case AArch64ISD::CTPOP_MERGE_PASSTHRU:
216 case AArch64ISD::DUP_MERGE_PASSTHRU:
217 case AArch64ISD::ABS_MERGE_PASSTHRU:
218 case AArch64ISD::NEG_MERGE_PASSTHRU:
219 case AArch64ISD::FNEG_MERGE_PASSTHRU:
220 case AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU:
221 case AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU:
222 case AArch64ISD::FCEIL_MERGE_PASSTHRU:
223 case AArch64ISD::FFLOOR_MERGE_PASSTHRU:
224 case AArch64ISD::FNEARBYINT_MERGE_PASSTHRU:
225 case AArch64ISD::FRINT_MERGE_PASSTHRU:
226 case AArch64ISD::FROUND_MERGE_PASSTHRU:
227 case AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU:
228 case AArch64ISD::FTRUNC_MERGE_PASSTHRU:
229 case AArch64ISD::FP_ROUND_MERGE_PASSTHRU:
230 case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
231 case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
232 case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
233 case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
234 case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
235 case AArch64ISD::FSQRT_MERGE_PASSTHRU:
236 case AArch64ISD::FRECPX_MERGE_PASSTHRU:
237 case AArch64ISD::FABS_MERGE_PASSTHRU:
242 // Returns true if inactive lanes are known to be zeroed by construction.
243 static bool isZeroingInactiveLanes(SDValue Op) {
244 switch (Op.getOpcode()) {
246 // We guarantee i1 splat_vectors to zero the other lanes by
247 // implementing it with ptrue and possibly a punpklo for nxv1i1.
248 if (ISD::isConstantSplatVectorAllOnes(Op.getNode()))
251 case AArch64ISD::PTRUE:
252 case AArch64ISD::SETCC_MERGE_ZERO:
254 case ISD::INTRINSIC_WO_CHAIN:
255 switch (Op.getConstantOperandVal(0)) {
258 case Intrinsic::aarch64_sve_ptrue:
259 case Intrinsic::aarch64_sve_pnext:
260 case Intrinsic::aarch64_sve_cmpeq:
261 case Intrinsic::aarch64_sve_cmpne:
262 case Intrinsic::aarch64_sve_cmpge:
263 case Intrinsic::aarch64_sve_cmpgt:
264 case Intrinsic::aarch64_sve_cmphs:
265 case Intrinsic::aarch64_sve_cmphi:
266 case Intrinsic::aarch64_sve_cmpeq_wide:
267 case Intrinsic::aarch64_sve_cmpne_wide:
268 case Intrinsic::aarch64_sve_cmpge_wide:
269 case Intrinsic::aarch64_sve_cmpgt_wide:
270 case Intrinsic::aarch64_sve_cmplt_wide:
271 case Intrinsic::aarch64_sve_cmple_wide:
272 case Intrinsic::aarch64_sve_cmphs_wide:
273 case Intrinsic::aarch64_sve_cmphi_wide:
274 case Intrinsic::aarch64_sve_cmplo_wide:
275 case Intrinsic::aarch64_sve_cmpls_wide:
276 case Intrinsic::aarch64_sve_fcmpeq:
277 case Intrinsic::aarch64_sve_fcmpne:
278 case Intrinsic::aarch64_sve_fcmpge:
279 case Intrinsic::aarch64_sve_fcmpgt:
280 case Intrinsic::aarch64_sve_fcmpuo:
286 AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
287 const AArch64Subtarget &STI)
288 : TargetLowering(TM), Subtarget(&STI) {
289 // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
290 // we have to make something up. Arbitrarily, choose ZeroOrOne.
291 setBooleanContents(ZeroOrOneBooleanContent);
292 // When comparing vectors, the result sets each element in the vector to
293 // all-ones or all-zeros.
294 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
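// In practice this means a scalar i1 produced by SETCC materialises as 0 or 1
// in a GPR (e.g. via CSET), while a vector compare such as CMEQ sets each
// lane to all-zeros or all-ones.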
296 // Set up the register classes.
297 addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
298 addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
300 if (Subtarget->hasLS64()) {
301 addRegisterClass(MVT::i64x8, &AArch64::GPR64x8ClassRegClass);
302 setOperationAction(ISD::LOAD, MVT::i64x8, Custom);
303 setOperationAction(ISD::STORE, MVT::i64x8, Custom);
306 if (Subtarget->hasFPARMv8()) {
307 addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
308 addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
309 addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
310 addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
311 addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
314 if (Subtarget->hasNEON()) {
315 addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
316 addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
317 // Someone set us up the NEON.
318 addDRTypeForNEON(MVT::v2f32);
319 addDRTypeForNEON(MVT::v8i8);
320 addDRTypeForNEON(MVT::v4i16);
321 addDRTypeForNEON(MVT::v2i32);
322 addDRTypeForNEON(MVT::v1i64);
323 addDRTypeForNEON(MVT::v1f64);
324 addDRTypeForNEON(MVT::v4f16);
325 if (Subtarget->hasBF16())
326 addDRTypeForNEON(MVT::v4bf16);
328 addQRTypeForNEON(MVT::v4f32);
329 addQRTypeForNEON(MVT::v2f64);
330 addQRTypeForNEON(MVT::v16i8);
331 addQRTypeForNEON(MVT::v8i16);
332 addQRTypeForNEON(MVT::v4i32);
333 addQRTypeForNEON(MVT::v2i64);
334 addQRTypeForNEON(MVT::v8f16);
335 if (Subtarget->hasBF16())
336 addQRTypeForNEON(MVT::v8bf16);
339 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
340 // Add legal SVE predicate types.
341 addRegisterClass(MVT::nxv1i1, &AArch64::PPRRegClass);
342 addRegisterClass(MVT::nxv2i1, &AArch64::PPRRegClass);
343 addRegisterClass(MVT::nxv4i1, &AArch64::PPRRegClass);
344 addRegisterClass(MVT::nxv8i1, &AArch64::PPRRegClass);
345 addRegisterClass(MVT::nxv16i1, &AArch64::PPRRegClass);
347 // Add legal SVE data types.
348 addRegisterClass(MVT::nxv16i8, &AArch64::ZPRRegClass);
349 addRegisterClass(MVT::nxv8i16, &AArch64::ZPRRegClass);
350 addRegisterClass(MVT::nxv4i32, &AArch64::ZPRRegClass);
351 addRegisterClass(MVT::nxv2i64, &AArch64::ZPRRegClass);
353 addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
354 addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
355 addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
356 addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
357 addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
358 addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
360 if (Subtarget->hasBF16()) {
361 addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
362 addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
363 addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
366 if (Subtarget->useSVEForFixedLengthVectors()) {
367 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
368 if (useSVEForFixedLengthVectorVT(VT))
369 addRegisterClass(VT, &AArch64::ZPRRegClass);
371 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
372 if (useSVEForFixedLengthVectorVT(VT))
373 addRegisterClass(VT, &AArch64::ZPRRegClass);
377 // Compute derived properties from the register classes
378 computeRegisterProperties(Subtarget->getRegisterInfo());
380 // Provide all sorts of operation actions
381 setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
382 setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
383 setOperationAction(ISD::SETCC, MVT::i32, Custom);
384 setOperationAction(ISD::SETCC, MVT::i64, Custom);
385 setOperationAction(ISD::SETCC, MVT::f16, Custom);
386 setOperationAction(ISD::SETCC, MVT::f32, Custom);
387 setOperationAction(ISD::SETCC, MVT::f64, Custom);
388 setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
389 setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
390 setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
391 setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
392 setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
393 setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
394 setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
395 setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
396 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
397 setOperationAction(ISD::BR_CC, MVT::i32, Custom);
398 setOperationAction(ISD::BR_CC, MVT::i64, Custom);
399 setOperationAction(ISD::BR_CC, MVT::f16, Custom);
400 setOperationAction(ISD::BR_CC, MVT::f32, Custom);
401 setOperationAction(ISD::BR_CC, MVT::f64, Custom);
402 setOperationAction(ISD::SELECT, MVT::i32, Custom);
403 setOperationAction(ISD::SELECT, MVT::i64, Custom);
404 setOperationAction(ISD::SELECT, MVT::f16, Custom);
405 setOperationAction(ISD::SELECT, MVT::f32, Custom);
406 setOperationAction(ISD::SELECT, MVT::f64, Custom);
407 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
408 setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
409 setOperationAction(ISD::SELECT_CC, MVT::f16, Custom);
410 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
411 setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
412 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
413 setOperationAction(ISD::JumpTable, MVT::i64, Custom);
415 setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
416 setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
417 setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
419 setOperationAction(ISD::FREM, MVT::f32, Expand);
420 setOperationAction(ISD::FREM, MVT::f64, Expand);
421 setOperationAction(ISD::FREM, MVT::f80, Expand);
423 setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
425 // Custom lowering hooks are needed for XOR
426 // to fold it into CSINC/CSINV.
427 setOperationAction(ISD::XOR, MVT::i32, Custom);
428 setOperationAction(ISD::XOR, MVT::i64, Custom);
430 // Virtually no operation on f128 is legal, but LLVM can't expand them when
431 // there's a valid register class, so we need custom operations in most cases.
432 setOperationAction(ISD::FABS, MVT::f128, Expand);
433 setOperationAction(ISD::FADD, MVT::f128, LibCall);
434 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
435 setOperationAction(ISD::FCOS, MVT::f128, Expand);
436 setOperationAction(ISD::FDIV, MVT::f128, LibCall);
437 setOperationAction(ISD::FMA, MVT::f128, Expand);
438 setOperationAction(ISD::FMUL, MVT::f128, LibCall);
439 setOperationAction(ISD::FNEG, MVT::f128, Expand);
440 setOperationAction(ISD::FPOW, MVT::f128, Expand);
441 setOperationAction(ISD::FREM, MVT::f128, Expand);
442 setOperationAction(ISD::FRINT, MVT::f128, Expand);
443 setOperationAction(ISD::FSIN, MVT::f128, Expand);
444 setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
445 setOperationAction(ISD::FSQRT, MVT::f128, Expand);
446 setOperationAction(ISD::FSUB, MVT::f128, LibCall);
447 setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
448 setOperationAction(ISD::SETCC, MVT::f128, Custom);
449 setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
450 setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
451 setOperationAction(ISD::BR_CC, MVT::f128, Custom);
452 setOperationAction(ISD::SELECT, MVT::f128, Custom);
453 setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
454 setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
455 // FIXME: f128 FMINIMUM and FMAXIMUM (including STRICT versions) currently
458 // Lowering for many of the conversions is actually specified by the non-f128
459 // type. The LowerXXX function will be trivial when f128 isn't involved.
460 setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
461 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
462 setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
463 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
464 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
465 setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
466 setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
467 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
468 setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
469 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
470 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
471 setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
472 setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
473 setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
474 setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
475 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
476 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
477 setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
478 setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
479 setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
480 setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
481 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
482 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
483 setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
484 setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
485 setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
486 setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
487 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom);
488 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
489 setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
491 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i32, Custom);
492 setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom);
493 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i32, Custom);
494 setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom);
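// Illustrative example (assuming the usual compiler-rt soft-float helpers):
// an FP_TO_SINT from f128 to i32 has no native instruction, so the custom
// lowering ends up emitting a call to a library routine such as __fixtfsi,
// whereas the same node with an f64 source selects a plain FCVTZS.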
496 // Variable arguments.
497 setOperationAction(ISD::VASTART, MVT::Other, Custom);
498 setOperationAction(ISD::VAARG, MVT::Other, Custom);
499 setOperationAction(ISD::VACOPY, MVT::Other, Custom);
500 setOperationAction(ISD::VAEND, MVT::Other, Expand);
502 // Variable-sized objects.
503 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
504 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
506 if (Subtarget->isTargetWindows())
507 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
509 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
511 // Constant pool entries
512 setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
515 setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
517 // AArch64 lacks both left-rotate and popcount instructions.
518 setOperationAction(ISD::ROTL, MVT::i32, Expand);
519 setOperationAction(ISD::ROTL, MVT::i64, Expand);
520 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
521 setOperationAction(ISD::ROTL, VT, Expand);
522 setOperationAction(ISD::ROTR, VT, Expand);
525 // AArch64 doesn't have i32 MULH{S|U}.
526 setOperationAction(ISD::MULHU, MVT::i32, Expand);
527 setOperationAction(ISD::MULHS, MVT::i32, Expand);
529 // AArch64 doesn't have {U|S}MUL_LOHI.
530 setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
531 setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
533 setOperationAction(ISD::CTPOP, MVT::i32, Custom);
534 setOperationAction(ISD::CTPOP, MVT::i64, Custom);
535 setOperationAction(ISD::CTPOP, MVT::i128, Custom);
537 setOperationAction(ISD::PARITY, MVT::i64, Custom);
538 setOperationAction(ISD::PARITY, MVT::i128, Custom);
540 setOperationAction(ISD::ABS, MVT::i32, Custom);
541 setOperationAction(ISD::ABS, MVT::i64, Custom);
543 setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
544 setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
545 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
546 setOperationAction(ISD::SDIVREM, VT, Expand);
547 setOperationAction(ISD::UDIVREM, VT, Expand);
549 setOperationAction(ISD::SREM, MVT::i32, Expand);
550 setOperationAction(ISD::SREM, MVT::i64, Expand);
551 setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
552 setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
553 setOperationAction(ISD::UREM, MVT::i32, Expand);
554 setOperationAction(ISD::UREM, MVT::i64, Expand);
556 // Custom lower Add/Sub/Mul with overflow.
557 setOperationAction(ISD::SADDO, MVT::i32, Custom);
558 setOperationAction(ISD::SADDO, MVT::i64, Custom);
559 setOperationAction(ISD::UADDO, MVT::i32, Custom);
560 setOperationAction(ISD::UADDO, MVT::i64, Custom);
561 setOperationAction(ISD::SSUBO, MVT::i32, Custom);
562 setOperationAction(ISD::SSUBO, MVT::i64, Custom);
563 setOperationAction(ISD::USUBO, MVT::i32, Custom);
564 setOperationAction(ISD::USUBO, MVT::i64, Custom);
565 setOperationAction(ISD::SMULO, MVT::i32, Custom);
566 setOperationAction(ISD::SMULO, MVT::i64, Custom);
567 setOperationAction(ISD::UMULO, MVT::i32, Custom);
568 setOperationAction(ISD::UMULO, MVT::i64, Custom);
570 setOperationAction(ISD::ADDCARRY, MVT::i32, Custom);
571 setOperationAction(ISD::ADDCARRY, MVT::i64, Custom);
572 setOperationAction(ISD::SUBCARRY, MVT::i32, Custom);
573 setOperationAction(ISD::SUBCARRY, MVT::i64, Custom);
574 setOperationAction(ISD::SADDO_CARRY, MVT::i32, Custom);
575 setOperationAction(ISD::SADDO_CARRY, MVT::i64, Custom);
576 setOperationAction(ISD::SSUBO_CARRY, MVT::i32, Custom);
577 setOperationAction(ISD::SSUBO_CARRY, MVT::i64, Custom);
579 setOperationAction(ISD::FSIN, MVT::f32, Expand);
580 setOperationAction(ISD::FSIN, MVT::f64, Expand);
581 setOperationAction(ISD::FCOS, MVT::f32, Expand);
582 setOperationAction(ISD::FCOS, MVT::f64, Expand);
583 setOperationAction(ISD::FPOW, MVT::f32, Expand);
584 setOperationAction(ISD::FPOW, MVT::f64, Expand);
585 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
586 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
587 if (Subtarget->hasFullFP16())
588 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Custom);
590 setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
592 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
593 ISD::FCOS, ISD::FSIN, ISD::FSINCOS,
594 ISD::FEXP, ISD::FEXP2, ISD::FLOG,
595 ISD::FLOG2, ISD::FLOG10, ISD::STRICT_FREM,
596 ISD::STRICT_FPOW, ISD::STRICT_FPOWI, ISD::STRICT_FCOS,
597 ISD::STRICT_FSIN, ISD::STRICT_FEXP, ISD::STRICT_FEXP2,
598 ISD::STRICT_FLOG, ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
599 setOperationAction(Op, MVT::f16, Promote);
600 setOperationAction(Op, MVT::v4f16, Expand);
601 setOperationAction(Op, MVT::v8f16, Expand);
604 if (!Subtarget->hasFullFP16()) {
606 {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
607 ISD::BR_CC, ISD::FADD, ISD::FSUB,
608 ISD::FMUL, ISD::FDIV, ISD::FMA,
609 ISD::FNEG, ISD::FABS, ISD::FCEIL,
610 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
611 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
612 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
613 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
614 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
615 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
616 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
617 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
618 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
619 ISD::STRICT_FMAXIMUM})
620 setOperationAction(Op, MVT::f16, Promote);
622 // Round-to-integer operations need custom lowering for fp16, as Promote
623 // doesn't work because the result type is integer.
624 for (auto Op : {ISD::STRICT_LROUND, ISD::STRICT_LLROUND, ISD::STRICT_LRINT,
626 setOperationAction(Op, MVT::f16, Custom);
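// A sketch of the intent: for an f16 operand, the custom lowering is expected
// to first extend to f32 (FCVT h->s) and then perform the rounding
// conversion, since promoting the whole node is impossible when the result is
// an integer type.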
628 // Promote v4f16 to v4f32 when that is known to be safe.
629 setOperationAction(ISD::FADD, MVT::v4f16, Promote);
630 setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
631 setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
632 setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
633 AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
634 AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
635 AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
636 AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
638 setOperationAction(ISD::FABS, MVT::v4f16, Expand);
639 setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
640 setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
641 setOperationAction(ISD::FROUNDEVEN, MVT::v4f16, Expand);
642 setOperationAction(ISD::FMA, MVT::v4f16, Expand);
643 setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
644 setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
645 setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
646 setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
647 setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
648 setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
649 setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
650 setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
651 setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
652 setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
653 setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
655 setOperationAction(ISD::FABS, MVT::v8f16, Expand);
656 setOperationAction(ISD::FADD, MVT::v8f16, Expand);
657 setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
658 setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
659 setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
660 setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
661 setOperationAction(ISD::FMA, MVT::v8f16, Expand);
662 setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
663 setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
664 setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
665 setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
666 setOperationAction(ISD::FROUNDEVEN, MVT::v8f16, Expand);
667 setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
668 setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
669 setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
670 setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
671 setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
672 setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
673 setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
674 setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
675 setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
678 // AArch64 has implementations of a lot of rounding-like FP operations.
680 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL,
681 ISD::FRINT, ISD::FTRUNC, ISD::FROUND,
682 ISD::FROUNDEVEN, ISD::FMINNUM, ISD::FMAXNUM,
683 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::LROUND,
684 ISD::LLROUND, ISD::LRINT, ISD::LLRINT,
685 ISD::STRICT_FFLOOR, ISD::STRICT_FCEIL, ISD::STRICT_FNEARBYINT,
686 ISD::STRICT_FRINT, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
687 ISD::STRICT_FROUND, ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM,
688 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_LROUND,
689 ISD::STRICT_LLROUND, ISD::STRICT_LRINT, ISD::STRICT_LLRINT}) {
690 for (MVT Ty : {MVT::f32, MVT::f64})
691 setOperationAction(Op, Ty, Legal);
692 if (Subtarget->hasFullFP16())
693 setOperationAction(Op, MVT::f16, Legal);
696 // Basic strict FP operations are legal
697 for (auto Op : {ISD::STRICT_FADD, ISD::STRICT_FSUB, ISD::STRICT_FMUL,
698 ISD::STRICT_FDIV, ISD::STRICT_FMA, ISD::STRICT_FSQRT}) {
699 for (MVT Ty : {MVT::f32, MVT::f64})
700 setOperationAction(Op, Ty, Legal);
701 if (Subtarget->hasFullFP16())
702 setOperationAction(Op, MVT::f16, Legal);
705 // Strict conversion to a larger type is legal
706 for (auto VT : {MVT::f32, MVT::f64})
707 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
709 setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
711 setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
712 setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom);
714 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
715 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
716 setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
717 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
718 setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
720 // Generate outline atomics library calls only if LSE was not specified for
722 if (Subtarget->outlineAtomics() && !Subtarget->hasLSE()) {
723 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, LibCall);
724 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, LibCall);
725 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, LibCall);
726 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, LibCall);
727 setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, LibCall);
728 setOperationAction(ISD::ATOMIC_SWAP, MVT::i8, LibCall);
729 setOperationAction(ISD::ATOMIC_SWAP, MVT::i16, LibCall);
730 setOperationAction(ISD::ATOMIC_SWAP, MVT::i32, LibCall);
731 setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, LibCall);
732 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i8, LibCall);
733 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i16, LibCall);
734 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i32, LibCall);
735 setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, LibCall);
736 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i8, LibCall);
737 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i16, LibCall);
738 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i32, LibCall);
739 setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, LibCall);
740 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i8, LibCall);
741 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i16, LibCall);
742 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i32, LibCall);
743 setOperationAction(ISD::ATOMIC_LOAD_CLR, MVT::i64, LibCall);
744 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i8, LibCall);
745 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i16, LibCall);
746 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i32, LibCall);
747 setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, LibCall);
748 #define LCALLNAMES(A, B, N) \
749 setLibcallName(A##N##_RELAX, #B #N "_relax"); \
750 setLibcallName(A##N##_ACQ, #B #N "_acq"); \
751 setLibcallName(A##N##_REL, #B #N "_rel"); \
752 setLibcallName(A##N##_ACQ_REL, #B #N "_acq_rel");
753 #define LCALLNAME4(A, B) \
754 LCALLNAMES(A, B, 1) \
755 LCALLNAMES(A, B, 2) LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8)
756 #define LCALLNAME5(A, B) \
757 LCALLNAMES(A, B, 1) \
758 LCALLNAMES(A, B, 2) \
759 LCALLNAMES(A, B, 4) LCALLNAMES(A, B, 8) LCALLNAMES(A, B, 16)
760 LCALLNAME5(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas)
761 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_SWP, __aarch64_swp)
762 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDADD, __aarch64_ldadd)
763 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDSET, __aarch64_ldset)
764 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDCLR, __aarch64_ldclr)
765 LCALLNAME4(RTLIB::OUTLINE_ATOMIC_LDEOR, __aarch64_ldeor)
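// For reference, a single expansion of the macros above, e.g.
// LCALLNAMES(RTLIB::OUTLINE_ATOMIC_CAS, __aarch64_cas, 4), produces:
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_RELAX, "__aarch64_cas4_relax");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ, "__aarch64_cas4_acq");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_REL, "__aarch64_cas4_rel");
//   setLibcallName(RTLIB::OUTLINE_ATOMIC_CAS4_ACQ_REL, "__aarch64_cas4_acq_rel");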
771 // 128-bit loads and stores can be done without expanding
772 setOperationAction(ISD::LOAD, MVT::i128, Custom);
773 setOperationAction(ISD::STORE, MVT::i128, Custom);
775 // Aligned 128-bit loads and stores are single-copy atomic according to the
777 if (Subtarget->hasLSE2()) {
778 setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
779 setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
782 // 256-bit non-temporal stores can be lowered to STNP. Do this as part of the
783 // custom lowering, as there are no un-paired non-temporal stores and
784 // legalization will break up 256-bit inputs.
785 setOperationAction(ISD::STORE, MVT::v32i8, Custom);
786 setOperationAction(ISD::STORE, MVT::v16i16, Custom);
787 setOperationAction(ISD::STORE, MVT::v16f16, Custom);
788 setOperationAction(ISD::STORE, MVT::v8i32, Custom);
789 setOperationAction(ISD::STORE, MVT::v8f32, Custom);
790 setOperationAction(ISD::STORE, MVT::v4f64, Custom);
791 setOperationAction(ISD::STORE, MVT::v4i64, Custom);
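// For instance, a non-temporal store of a v4i64 value is expected to be split
// into two v2i64 halves and emitted as a single STNP of a Q-register pair,
// rather than being scalarised by generic legalization.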
793 // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
794 // This requires the Performance Monitors extension.
795 if (Subtarget->hasPerfMon())
796 setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
798 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
799 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
800 // Issue __sincos_stret if available.
801 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
802 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
804 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
805 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
808 if (Subtarget->getTargetTriple().isOSMSVCRT()) {
809 // MSVCRT doesn't have powi; fall back to pow
810 setLibcallName(RTLIB::POWI_F32, nullptr);
811 setLibcallName(RTLIB::POWI_F64, nullptr);
814 // Make floating-point constants legal for the large code model, so they don't
815 // become loads from the constant pool.
816 if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
817 setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
818 setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
821 // AArch64 does not have floating-point extending loads, i1 sign-extending
822 // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
823 for (MVT VT : MVT::fp_valuetypes()) {
824 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
825 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
826 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
827 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
829 for (MVT VT : MVT::integer_valuetypes())
830 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);
832 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
833 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
834 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
835 setTruncStoreAction(MVT::f128, MVT::f80, Expand);
836 setTruncStoreAction(MVT::f128, MVT::f64, Expand);
837 setTruncStoreAction(MVT::f128, MVT::f32, Expand);
838 setTruncStoreAction(MVT::f128, MVT::f16, Expand);
840 setOperationAction(ISD::BITCAST, MVT::i16, Custom);
841 setOperationAction(ISD::BITCAST, MVT::f16, Custom);
842 setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
844 // Indexed loads and stores are supported.
845 for (unsigned im = (unsigned)ISD::PRE_INC;
846 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
847 setIndexedLoadAction(im, MVT::i8, Legal);
848 setIndexedLoadAction(im, MVT::i16, Legal);
849 setIndexedLoadAction(im, MVT::i32, Legal);
850 setIndexedLoadAction(im, MVT::i64, Legal);
851 setIndexedLoadAction(im, MVT::f64, Legal);
852 setIndexedLoadAction(im, MVT::f32, Legal);
853 setIndexedLoadAction(im, MVT::f16, Legal);
854 setIndexedLoadAction(im, MVT::bf16, Legal);
855 setIndexedStoreAction(im, MVT::i8, Legal);
856 setIndexedStoreAction(im, MVT::i16, Legal);
857 setIndexedStoreAction(im, MVT::i32, Legal);
858 setIndexedStoreAction(im, MVT::i64, Legal);
859 setIndexedStoreAction(im, MVT::f64, Legal);
860 setIndexedStoreAction(im, MVT::f32, Legal);
861 setIndexedStoreAction(im, MVT::f16, Legal);
862 setIndexedStoreAction(im, MVT::bf16, Legal);
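// As an illustration, the pre- and post-indexed forms correspond to
// addressing modes like "ldr x0, [x1, #8]!" (pre-index) and "ldr x0, [x1], #8"
// (post-index), where the base register is updated as part of the access.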
866 setOperationAction(ISD::TRAP, MVT::Other, Legal);
867 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
868 setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal);
870 // We combine OR nodes for bitfield operations.
871 setTargetDAGCombine(ISD::OR);
872 // Try to create BICs for vector ANDs.
873 setTargetDAGCombine(ISD::AND);
875 // Vector add and sub nodes may conceal a high-half opportunity.
876 // Also, try to fold ADD into CSINC/CSINV.
877 setTargetDAGCombine({ISD::ADD, ISD::ABS, ISD::SUB, ISD::XOR, ISD::SINT_TO_FP,
880 setTargetDAGCombine({ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
881 ISD::FP_TO_UINT_SAT, ISD::FDIV});
883 // Try to combine setcc with csel.
884 setTargetDAGCombine(ISD::SETCC);
886 setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
888 setTargetDAGCombine({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND,
889 ISD::VECTOR_SPLICE, ISD::SIGN_EXTEND_INREG,
890 ISD::CONCAT_VECTORS, ISD::EXTRACT_SUBVECTOR,
891 ISD::INSERT_SUBVECTOR, ISD::STORE, ISD::BUILD_VECTOR});
892 if (Subtarget->supportsAddressTopByteIgnored())
893 setTargetDAGCombine(ISD::LOAD);
895 setTargetDAGCombine(ISD::MSTORE);
897 setTargetDAGCombine(ISD::MUL);
899 setTargetDAGCombine({ISD::SELECT, ISD::VSELECT});
901 setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
902 ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT,
903 ISD::VECREDUCE_ADD, ISD::STEP_VECTOR});
905 setTargetDAGCombine({ISD::MGATHER, ISD::MSCATTER});
907 setTargetDAGCombine(ISD::FP_EXTEND);
909 setTargetDAGCombine(ISD::GlobalAddress);
911 // In case of strict alignment, avoid an excessive number of byte wide stores.
912 MaxStoresPerMemsetOptSize = 8;
914 Subtarget->requiresStrictAlign() ? MaxStoresPerMemsetOptSize : 32;
916 MaxGluedStoresPerMemcpy = 4;
917 MaxStoresPerMemcpyOptSize = 4;
919 Subtarget->requiresStrictAlign() ? MaxStoresPerMemcpyOptSize : 16;
921 MaxStoresPerMemmoveOptSize = 4;
922 MaxStoresPerMemmove = 4;
924 MaxLoadsPerMemcmpOptSize = 4;
926 Subtarget->requiresStrictAlign() ? MaxLoadsPerMemcmpOptSize : 8;
928 setStackPointerRegisterToSaveRestore(AArch64::SP);
930 setSchedulingPreference(Sched::Hybrid);
932 EnableExtLdPromotion = true;
934 // Set required alignment.
935 setMinFunctionAlignment(Align(4));
936 // Set preferred alignments.
937 setPrefLoopAlignment(Align(1ULL << STI.getPrefLoopLogAlignment()));
938 setMaxBytesForAlignment(STI.getMaxBytesForLoopAlignment());
939 setPrefFunctionAlignment(Align(1ULL << STI.getPrefFunctionLogAlignment()));
941 // Only change the limit for entries in a jump table if specified by
942 // the subtarget, but not at the command line.
943 unsigned MaxJT = STI.getMaximumJumpTableSize();
944 if (MaxJT && getMaximumJumpTableSize() == UINT_MAX)
945 setMaximumJumpTableSize(MaxJT);
947 setHasExtractBitsInsn(true);
949 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
951 if (Subtarget->hasNEON()) {
952 // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
953 // silliness like this:
955 {ISD::SELECT, ISD::SELECT_CC, ISD::SETCC,
956 ISD::BR_CC, ISD::FADD, ISD::FSUB,
957 ISD::FMUL, ISD::FDIV, ISD::FMA,
958 ISD::FNEG, ISD::FABS, ISD::FCEIL,
959 ISD::FSQRT, ISD::FFLOOR, ISD::FNEARBYINT,
960 ISD::FRINT, ISD::FROUND, ISD::FROUNDEVEN,
961 ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM,
962 ISD::FMINIMUM, ISD::FMAXIMUM, ISD::STRICT_FADD,
963 ISD::STRICT_FSUB, ISD::STRICT_FMUL, ISD::STRICT_FDIV,
964 ISD::STRICT_FMA, ISD::STRICT_FCEIL, ISD::STRICT_FFLOOR,
965 ISD::STRICT_FSQRT, ISD::STRICT_FRINT, ISD::STRICT_FNEARBYINT,
966 ISD::STRICT_FROUND, ISD::STRICT_FTRUNC, ISD::STRICT_FROUNDEVEN,
967 ISD::STRICT_FMINNUM, ISD::STRICT_FMAXNUM, ISD::STRICT_FMINIMUM,
968 ISD::STRICT_FMAXIMUM})
969 setOperationAction(Op, MVT::v1f64, Expand);
972 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP,
973 ISD::FP_ROUND, ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT, ISD::MUL,
974 ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT,
975 ISD::STRICT_SINT_TO_FP, ISD::STRICT_UINT_TO_FP, ISD::STRICT_FP_ROUND})
976 setOperationAction(Op, MVT::v1i64, Expand);
978 // AArch64 doesn't have direct vector->f32 conversion instructions for
979 // elements smaller than i32, so promote the input to i32 first.
980 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i8, MVT::v4i32);
981 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i8, MVT::v4i32);
983 // Similarly, there is no direct i32 -> f64 vector conversion instruction.
984 // Or, direct i32 -> f16 vector conversion. Set it to Custom, so the
985 // conversion happens in two steps: v4i32 -> v4f32 -> v4f16
986 for (auto Op : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP,
987 ISD::STRICT_UINT_TO_FP})
988 for (auto VT : {MVT::v2i32, MVT::v2i64, MVT::v4i32})
989 setOperationAction(Op, VT, Custom);
991 if (Subtarget->hasFullFP16()) {
992 setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
994 setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Custom);
995 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
996 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Custom);
997 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
998 setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
999 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
1000 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Custom);
1001 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1003 // When AArch64 doesn't have full fp16 support, promote the input
1005 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i8, MVT::v8i32);
1006 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i8, MVT::v8i32);
1007 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i8, MVT::v16i32);
1008 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i8, MVT::v16i32);
1009 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i16, MVT::v4i32);
1010 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i16, MVT::v4i32);
1011 setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i16, MVT::v8i32);
1012 setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i16, MVT::v8i32);
1015 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
1016 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
1017 setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
1018 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
1019 setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
1020 setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
1021 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1022 setOperationAction(ISD::BITREVERSE, MVT::v2i64, Custom);
1023 for (auto VT : {MVT::v1i64, MVT::v2i64}) {
1024 setOperationAction(ISD::UMAX, VT, Custom);
1025 setOperationAction(ISD::SMAX, VT, Custom);
1026 setOperationAction(ISD::UMIN, VT, Custom);
1027 setOperationAction(ISD::SMIN, VT, Custom);
1030 // AArch64 doesn't have MUL.2d:
1031 setOperationAction(ISD::MUL, MVT::v2i64, Expand);
1032 // Custom handling for some quad-vector types to detect MULL.
1033 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
1034 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
1035 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1038 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1039 MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1040 setOperationAction(ISD::SADDSAT, VT, Legal);
1041 setOperationAction(ISD::UADDSAT, VT, Legal);
1042 setOperationAction(ISD::SSUBSAT, VT, Legal);
1043 setOperationAction(ISD::USUBSAT, VT, Legal);
1046 for (MVT VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
1048 setOperationAction(ISD::AVGFLOORS, VT, Legal);
1049 setOperationAction(ISD::AVGFLOORU, VT, Legal);
1050 setOperationAction(ISD::AVGCEILS, VT, Legal);
1051 setOperationAction(ISD::AVGCEILU, VT, Legal);
1052 setOperationAction(ISD::ABDS, VT, Legal);
1053 setOperationAction(ISD::ABDU, VT, Legal);
1056 // Vector reductions
1057 for (MVT VT : { MVT::v4f16, MVT::v2f32,
1058 MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
1059 if (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()) {
1060 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1061 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1063 setOperationAction(ISD::VECREDUCE_FADD, VT, Legal);
1066 for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
1067 MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
1068 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1069 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1070 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1071 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1072 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1074 setOperationAction(ISD::VECREDUCE_ADD, MVT::v2i64, Custom);
1076 setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
1077 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
1078 // Likewise, narrowing and extending vector loads/stores aren't handled
1080 for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
1081 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
1083 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32) {
1084 setOperationAction(ISD::MULHS, VT, Legal);
1085 setOperationAction(ISD::MULHU, VT, Legal);
1087 setOperationAction(ISD::MULHS, VT, Expand);
1088 setOperationAction(ISD::MULHU, VT, Expand);
1090 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1091 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1093 setOperationAction(ISD::BSWAP, VT, Expand);
1094 setOperationAction(ISD::CTTZ, VT, Expand);
1096 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
1097 setTruncStoreAction(VT, InnerVT, Expand);
1098 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1099 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1100 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1104 // AArch64 has implementations of a lot of rounding-like FP operations.
1106 {ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
1107 ISD::FROUND, ISD::FROUNDEVEN, ISD::STRICT_FFLOOR,
1108 ISD::STRICT_FNEARBYINT, ISD::STRICT_FCEIL, ISD::STRICT_FRINT,
1109 ISD::STRICT_FTRUNC, ISD::STRICT_FROUND, ISD::STRICT_FROUNDEVEN}) {
1110 for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64})
1111 setOperationAction(Op, Ty, Legal);
1112 if (Subtarget->hasFullFP16())
1113 for (MVT Ty : {MVT::v4f16, MVT::v8f16})
1114 setOperationAction(Op, Ty, Legal);
1117 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
1119 setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1120 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1121 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i16, MVT::v4i8, Custom);
1122 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1123 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1124 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Custom);
1126 // ADDP custom lowering
1127 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
1128 setOperationAction(ISD::ADD, VT, Custom);
1129 // FADDP custom lowering
1130 for (MVT VT : { MVT::v16f16, MVT::v8f32, MVT::v4f64 })
1131 setOperationAction(ISD::FADD, VT, Custom);
1134 if (Subtarget->hasSME()) {
1135 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1138 // FIXME: Move lowering for more nodes here if those are common between
1140 if (Subtarget->hasSVE() || Subtarget->hasSME()) {
1142 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1143 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1144 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1148 if (Subtarget->hasSVE()) {
1149 for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
1150 setOperationAction(ISD::BITREVERSE, VT, Custom);
1151 setOperationAction(ISD::BSWAP, VT, Custom);
1152 setOperationAction(ISD::CTLZ, VT, Custom);
1153 setOperationAction(ISD::CTPOP, VT, Custom);
1154 setOperationAction(ISD::CTTZ, VT, Custom);
1155 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1156 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1157 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1158 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1159 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1160 setOperationAction(ISD::MGATHER, VT, Custom);
1161 setOperationAction(ISD::MSCATTER, VT, Custom);
1162 setOperationAction(ISD::MLOAD, VT, Custom);
1163 setOperationAction(ISD::MUL, VT, Custom);
1164 setOperationAction(ISD::MULHS, VT, Custom);
1165 setOperationAction(ISD::MULHU, VT, Custom);
1166 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1167 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1168 setOperationAction(ISD::SELECT, VT, Custom);
1169 setOperationAction(ISD::SETCC, VT, Custom);
1170 setOperationAction(ISD::SDIV, VT, Custom);
1171 setOperationAction(ISD::UDIV, VT, Custom);
1172 setOperationAction(ISD::SMIN, VT, Custom);
1173 setOperationAction(ISD::UMIN, VT, Custom);
1174 setOperationAction(ISD::SMAX, VT, Custom);
1175 setOperationAction(ISD::UMAX, VT, Custom);
1176 setOperationAction(ISD::SHL, VT, Custom);
1177 setOperationAction(ISD::SRL, VT, Custom);
1178 setOperationAction(ISD::SRA, VT, Custom);
1179 setOperationAction(ISD::ABS, VT, Custom);
1180 setOperationAction(ISD::ABDS, VT, Custom);
1181 setOperationAction(ISD::ABDU, VT, Custom);
1182 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1183 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1184 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1185 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1186 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1187 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1188 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1189 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1191 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
1192 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
1193 setOperationAction(ISD::SELECT_CC, VT, Expand);
1194 setOperationAction(ISD::ROTL, VT, Expand);
1195 setOperationAction(ISD::ROTR, VT, Expand);
1197 setOperationAction(ISD::SADDSAT, VT, Legal);
1198 setOperationAction(ISD::UADDSAT, VT, Legal);
1199 setOperationAction(ISD::SSUBSAT, VT, Legal);
1200 setOperationAction(ISD::USUBSAT, VT, Legal);
1201 setOperationAction(ISD::UREM, VT, Expand);
1202 setOperationAction(ISD::SREM, VT, Expand);
1203 setOperationAction(ISD::SDIVREM, VT, Expand);
1204 setOperationAction(ISD::UDIVREM, VT, Expand);
1207 // Illegal unpacked integer vector types.
1208 for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32}) {
1209 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1210 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1213 // Legalize unpacked bitcasts to REINTERPRET_CAST.
1214 for (auto VT : {MVT::nxv2i16, MVT::nxv4i16, MVT::nxv2i32, MVT::nxv2bf16,
1215 MVT::nxv4bf16, MVT::nxv2f16, MVT::nxv4f16, MVT::nxv2f32})
1216 setOperationAction(ISD::BITCAST, VT, Custom);
1219 { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
1220 MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
1221 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
1224 {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1, MVT::nxv1i1}) {
1225 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1226 setOperationAction(ISD::SELECT, VT, Custom);
1227 setOperationAction(ISD::SETCC, VT, Custom);
1228 setOperationAction(ISD::TRUNCATE, VT, Custom);
1229 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1230 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1231 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1233 setOperationAction(ISD::SELECT_CC, VT, Expand);
1234 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1235 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1237 // There are no legal MVT::nxv16f## based types.
1238 if (VT != MVT::nxv16i1) {
1239 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1240 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1244 // NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
1245 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
1246 MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1247 MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1248 setOperationAction(ISD::MLOAD, VT, Custom);
1249 setOperationAction(ISD::MSTORE, VT, Custom);
1250 setOperationAction(ISD::MGATHER, VT, Custom);
1251 setOperationAction(ISD::MSCATTER, VT, Custom);
1254 // Firstly, exclude all scalable vector extending loads/truncating stores,
1255 // including both integer and floating-point scalable vectors.
1256 for (MVT VT : MVT::scalable_vector_valuetypes()) {
1257 for (MVT InnerVT : MVT::scalable_vector_valuetypes()) {
1258 setTruncStoreAction(VT, InnerVT, Expand);
1259 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
1260 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
1261 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
1265 // Then, selectively enable those which we directly support.
1266 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i8, Legal);
1267 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i16, Legal);
1268 setTruncStoreAction(MVT::nxv2i64, MVT::nxv2i32, Legal);
1269 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i8, Legal);
1270 setTruncStoreAction(MVT::nxv4i32, MVT::nxv4i16, Legal);
1271 setTruncStoreAction(MVT::nxv8i16, MVT::nxv8i8, Legal);
1272 for (auto Op : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1273 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i8, Legal);
1274 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i16, Legal);
1275 setLoadExtAction(Op, MVT::nxv2i64, MVT::nxv2i32, Legal);
1276 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i8, Legal);
1277 setLoadExtAction(Op, MVT::nxv4i32, MVT::nxv4i16, Legal);
1278 setLoadExtAction(Op, MVT::nxv8i16, MVT::nxv8i8, Legal);
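// As an example of what this enables (illustrative): an extending load from
// nxv2i32 into an nxv2i64 result can be selected directly as an SVE LD1W
// (or LD1SW for the sign-extending case) instead of a load plus a separate
// extend.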
1281 // SVE supports truncating stores of 64- and 128-bit vectors.
1282 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Custom);
1283 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Custom);
1284 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Custom);
1285 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Custom);
1286 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Custom);
1288 for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
1289 MVT::nxv4f32, MVT::nxv2f64}) {
1290 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1291 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1292 setOperationAction(ISD::MGATHER, VT, Custom);
1293 setOperationAction(ISD::MSCATTER, VT, Custom);
1294 setOperationAction(ISD::MLOAD, VT, Custom);
1295 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1296 setOperationAction(ISD::SELECT, VT, Custom);
1297 setOperationAction(ISD::FADD, VT, Custom);
1298 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1299 setOperationAction(ISD::FDIV, VT, Custom);
1300 setOperationAction(ISD::FMA, VT, Custom);
1301 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1302 setOperationAction(ISD::FMAXNUM, VT, Custom);
1303 setOperationAction(ISD::FMINIMUM, VT, Custom);
1304 setOperationAction(ISD::FMINNUM, VT, Custom);
1305 setOperationAction(ISD::FMUL, VT, Custom);
1306 setOperationAction(ISD::FNEG, VT, Custom);
1307 setOperationAction(ISD::FSUB, VT, Custom);
1308 setOperationAction(ISD::FCEIL, VT, Custom);
1309 setOperationAction(ISD::FFLOOR, VT, Custom);
1310 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1311 setOperationAction(ISD::FRINT, VT, Custom);
1312 setOperationAction(ISD::FROUND, VT, Custom);
1313 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1314 setOperationAction(ISD::FTRUNC, VT, Custom);
1315 setOperationAction(ISD::FSQRT, VT, Custom);
1316 setOperationAction(ISD::FABS, VT, Custom);
1317 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1318 setOperationAction(ISD::FP_ROUND, VT, Custom);
1319 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1320 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1321 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1322 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1323 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1325 setOperationAction(ISD::SELECT_CC, VT, Expand);
1326 setOperationAction(ISD::FREM, VT, Expand);
1327 setOperationAction(ISD::FPOW, VT, Expand);
1328 setOperationAction(ISD::FPOWI, VT, Expand);
1329 setOperationAction(ISD::FCOS, VT, Expand);
1330 setOperationAction(ISD::FSIN, VT, Expand);
1331 setOperationAction(ISD::FSINCOS, VT, Expand);
1332 setOperationAction(ISD::FEXP, VT, Expand);
1333 setOperationAction(ISD::FEXP2, VT, Expand);
1334 setOperationAction(ISD::FLOG, VT, Expand);
1335 setOperationAction(ISD::FLOG2, VT, Expand);
1336 setOperationAction(ISD::FLOG10, VT, Expand);
1338 setCondCodeAction(ISD::SETO, VT, Expand);
1339 setCondCodeAction(ISD::SETOLT, VT, Expand);
1340 setCondCodeAction(ISD::SETLT, VT, Expand);
1341 setCondCodeAction(ISD::SETOLE, VT, Expand);
1342 setCondCodeAction(ISD::SETLE, VT, Expand);
1343 setCondCodeAction(ISD::SETULT, VT, Expand);
1344 setCondCodeAction(ISD::SETULE, VT, Expand);
1345 setCondCodeAction(ISD::SETUGE, VT, Expand);
1346 setCondCodeAction(ISD::SETUGT, VT, Expand);
1347 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1348 setCondCodeAction(ISD::SETONE, VT, Expand);
1351 for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
1352 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1353 setOperationAction(ISD::MGATHER, VT, Custom);
1354 setOperationAction(ISD::MSCATTER, VT, Custom);
1355 setOperationAction(ISD::MLOAD, VT, Custom);
1356 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1357 setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
1360 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
1361 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
1363 // NEON doesn't support integer divides, but SVE does
1364 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
1365 MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
1366 setOperationAction(ISD::SDIV, VT, Custom);
1367 setOperationAction(ISD::UDIV, VT, Custom);
1370 // NEON doesn't support 64-bit vector integer muls, but SVE does.
1371 setOperationAction(ISD::MUL, MVT::v1i64, Custom);
1372 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
1374 // NOTE: Currently this has to happen after computeRegisterProperties rather
1375 // than the preferred option of combining it with the addRegisterClass call.
1376 if (Subtarget->useSVEForFixedLengthVectors()) {
1377 for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
1378 if (useSVEForFixedLengthVectorVT(VT))
1379 addTypeForFixedLengthSVE(VT);
1380 for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
1381 if (useSVEForFixedLengthVectorVT(VT))
1382 addTypeForFixedLengthSVE(VT);
1384   // 64-bit results can mean an input that is bigger than a NEON vector.
1385 for (auto VT : {MVT::v8i8, MVT::v4i16})
1386 setOperationAction(ISD::TRUNCATE, VT, Custom);
1387 setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
1389   // 128-bit results imply an input that is bigger than a NEON vector.
1390 for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
1391 setOperationAction(ISD::TRUNCATE, VT, Custom);
1392 for (auto VT : {MVT::v8f16, MVT::v4f32})
1393 setOperationAction(ISD::FP_ROUND, VT, Custom);
1395 // These operations are not supported on NEON but SVE can do them.
1396 setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
1397 setOperationAction(ISD::CTLZ, MVT::v1i64, Custom);
1398 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1399 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
1400 setOperationAction(ISD::MULHS, MVT::v1i64, Custom);
1401 setOperationAction(ISD::MULHS, MVT::v2i64, Custom);
1402 setOperationAction(ISD::MULHU, MVT::v1i64, Custom);
1403 setOperationAction(ISD::MULHU, MVT::v2i64, Custom);
1404 setOperationAction(ISD::SMAX, MVT::v1i64, Custom);
1405 setOperationAction(ISD::SMAX, MVT::v2i64, Custom);
1406 setOperationAction(ISD::SMIN, MVT::v1i64, Custom);
1407 setOperationAction(ISD::SMIN, MVT::v2i64, Custom);
1408 setOperationAction(ISD::UMAX, MVT::v1i64, Custom);
1409 setOperationAction(ISD::UMAX, MVT::v2i64, Custom);
1410 setOperationAction(ISD::UMIN, MVT::v1i64, Custom);
1411 setOperationAction(ISD::UMIN, MVT::v2i64, Custom);
1412 setOperationAction(ISD::VECREDUCE_SMAX, MVT::v2i64, Custom);
1413 setOperationAction(ISD::VECREDUCE_SMIN, MVT::v2i64, Custom);
1414 setOperationAction(ISD::VECREDUCE_UMAX, MVT::v2i64, Custom);
1415 setOperationAction(ISD::VECREDUCE_UMIN, MVT::v2i64, Custom);
1417 // Int operations with no NEON support.
1418 for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
1419 MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
1420 setOperationAction(ISD::BITREVERSE, VT, Custom);
1421 setOperationAction(ISD::CTTZ, VT, Custom);
1422 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1423 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1424 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1427 // FP operations with no NEON support.
1428 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32,
1429 MVT::v1f64, MVT::v2f64})
1430 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1432 // Use SVE for vectors with more than 2 elements.
1433 for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
1434 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1437 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
1438 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
1439 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
1440 setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
1442 setOperationAction(ISD::VSCALE, MVT::i32, Custom);
1445 if (Subtarget->hasMOPS() && Subtarget->hasMTE()) {
1446 // Only required for llvm.aarch64.mops.memset.tag
1447 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
1450 PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
1452 IsStrictFPEnabled = true;
1455 void AArch64TargetLowering::addTypeForNEON(MVT VT) {
1456 assert(VT.isVector() && "VT should be a vector type");
1458 if (VT.isFloatingPoint()) {
1459 MVT PromoteTo = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
1460 setOperationPromotedToType(ISD::LOAD, VT, PromoteTo);
1461 setOperationPromotedToType(ISD::STORE, VT, PromoteTo);
1464 // Mark vector float intrinsics as expand.
1465 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
1466 setOperationAction(ISD::FSIN, VT, Expand);
1467 setOperationAction(ISD::FCOS, VT, Expand);
1468 setOperationAction(ISD::FPOW, VT, Expand);
1469 setOperationAction(ISD::FLOG, VT, Expand);
1470 setOperationAction(ISD::FLOG2, VT, Expand);
1471 setOperationAction(ISD::FLOG10, VT, Expand);
1472 setOperationAction(ISD::FEXP, VT, Expand);
1473 setOperationAction(ISD::FEXP2, VT, Expand);
1476 // But we do support custom-lowering for FCOPYSIGN.
1477 if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
1478 ((VT == MVT::v4f16 || VT == MVT::v8f16) && Subtarget->hasFullFP16()))
1479 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1481 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1482 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1483 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1484 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1485 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1486 setOperationAction(ISD::SRA, VT, Custom);
1487 setOperationAction(ISD::SRL, VT, Custom);
1488 setOperationAction(ISD::SHL, VT, Custom);
1489 setOperationAction(ISD::OR, VT, Custom);
1490 setOperationAction(ISD::SETCC, VT, Custom);
1491 setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);
1493 setOperationAction(ISD::SELECT, VT, Expand);
1494 setOperationAction(ISD::SELECT_CC, VT, Expand);
1495 setOperationAction(ISD::VSELECT, VT, Expand);
1496 for (MVT InnerVT : MVT::all_valuetypes())
1497 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
1499   // CNT only supports the B element size, so wider popcounts use UADDLP to widen.
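  // e.g. a v4i32 CTPOP becomes a CNT over the bytes followed by UADDLP steps
  // that widen the byte sums to i16 and then i32.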
1500 if (VT != MVT::v8i8 && VT != MVT::v16i8)
1501 setOperationAction(ISD::CTPOP, VT, Custom);
1503 setOperationAction(ISD::UDIV, VT, Expand);
1504 setOperationAction(ISD::SDIV, VT, Expand);
1505 setOperationAction(ISD::UREM, VT, Expand);
1506 setOperationAction(ISD::SREM, VT, Expand);
1507 setOperationAction(ISD::FREM, VT, Expand);
1509 for (unsigned Opcode :
1510 {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::FP_TO_SINT_SAT,
1511 ISD::FP_TO_UINT_SAT, ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT})
1512 setOperationAction(Opcode, VT, Custom);
1514 if (!VT.isFloatingPoint())
1515 setOperationAction(ISD::ABS, VT, Legal);
1517 // [SU][MIN|MAX] are available for all NEON types apart from i64.
1518 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
1519 for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
1520 setOperationAction(Opcode, VT, Legal);
1522   // F[MIN|MAX][NUM|NAN] and simple strict operations are available for all FP
  // types (bf16 excluded, and f16 only with FullFP16).
1524 if (VT.isFloatingPoint() &&
1525 VT.getVectorElementType() != MVT::bf16 &&
1526 (VT.getVectorElementType() != MVT::f16 || Subtarget->hasFullFP16()))
1527 for (unsigned Opcode :
1528 {ISD::FMINIMUM, ISD::FMAXIMUM, ISD::FMINNUM, ISD::FMAXNUM,
1529 ISD::STRICT_FMINIMUM, ISD::STRICT_FMAXIMUM, ISD::STRICT_FMINNUM,
1530 ISD::STRICT_FMAXNUM, ISD::STRICT_FADD, ISD::STRICT_FSUB,
1531 ISD::STRICT_FMUL, ISD::STRICT_FDIV, ISD::STRICT_FMA,
1533 setOperationAction(Opcode, VT, Legal);
1535 // Strict fp extend and trunc are legal
1536 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 16)
1537 setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal);
1538 if (VT.isFloatingPoint() && VT.getScalarSizeInBits() != 64)
1539 setOperationAction(ISD::STRICT_FP_ROUND, VT, Legal);
1541 // FIXME: We could potentially make use of the vector comparison instructions
1542   // for STRICT_FSETCC and STRICT_FSETCCS, but there are a number of complications:
1544 // * FCMPEQ/NE are quiet comparisons, the rest are signalling comparisons,
1545 // so we would need to expand when the condition code doesn't match the
1546 // kind of comparison.
1547 // * Some kinds of comparison require more than one FCMXY instruction so
1548 // would need to be expanded instead.
1549 // * The lowering of the non-strict versions involves target-specific ISD
1550 // nodes so we would likely need to add strict versions of all of them and
1551 // handle them appropriately.
1552 setOperationAction(ISD::STRICT_FSETCC, VT, Expand);
1553 setOperationAction(ISD::STRICT_FSETCCS, VT, Expand);
1555 if (Subtarget->isLittleEndian()) {
1556 for (unsigned im = (unsigned)ISD::PRE_INC;
1557 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
1558 setIndexedLoadAction(im, VT, Legal);
1559 setIndexedStoreAction(im, VT, Legal);
1564 bool AArch64TargetLowering::shouldExpandGetActiveLaneMask(EVT ResVT,
1566 // Only SVE has a 1:1 mapping from intrinsic -> instruction (whilelo).
1567 if (!Subtarget->hasSVE())
1570 // We can only support legal predicate result types. We can use the SVE
1571 // whilelo instruction for generating fixed-width predicates too.
1572 if (ResVT != MVT::nxv2i1 && ResVT != MVT::nxv4i1 && ResVT != MVT::nxv8i1 &&
1573 ResVT != MVT::nxv16i1 && ResVT != MVT::v2i1 && ResVT != MVT::v4i1 &&
1574 ResVT != MVT::v8i1 && ResVT != MVT::v16i1)
1577 // The whilelo instruction only works with i32 or i64 scalar inputs.
1578 if (OpVT != MVT::i32 && OpVT != MVT::i64)
1584 void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
1585 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
1587 // By default everything must be expanded.
1588 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
1589 setOperationAction(Op, VT, Expand);
1591 // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
1592 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1594 if (VT.isFloatingPoint()) {
1595 setCondCodeAction(ISD::SETO, VT, Expand);
1596 setCondCodeAction(ISD::SETOLT, VT, Expand);
1597 setCondCodeAction(ISD::SETLT, VT, Expand);
1598 setCondCodeAction(ISD::SETOLE, VT, Expand);
1599 setCondCodeAction(ISD::SETLE, VT, Expand);
1600 setCondCodeAction(ISD::SETULT, VT, Expand);
1601 setCondCodeAction(ISD::SETULE, VT, Expand);
1602 setCondCodeAction(ISD::SETUGE, VT, Expand);
1603 setCondCodeAction(ISD::SETUGT, VT, Expand);
1604 setCondCodeAction(ISD::SETUEQ, VT, Expand);
1605 setCondCodeAction(ISD::SETONE, VT, Expand);
1608 // Mark integer truncating stores/extending loads as having custom lowering
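  // (For example, for v4i32 this covers truncating stores to v4i8 and v4i16,
  // plus the matching sign/zero-extending loads.)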
1609 if (VT.isInteger()) {
1610 MVT InnerVT = VT.changeVectorElementType(MVT::i8);
1611 while (InnerVT != VT) {
1612 setTruncStoreAction(VT, InnerVT, Custom);
1613 setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
1614 setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
1615 InnerVT = InnerVT.changeVectorElementType(
1616 MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
1620   // Mark floating-point truncating stores/extending loads as having custom lowering.
1622 if (VT.isFloatingPoint()) {
1623 MVT InnerVT = VT.changeVectorElementType(MVT::f16);
1624 while (InnerVT != VT) {
1625 setTruncStoreAction(VT, InnerVT, Custom);
1626 setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Custom);
1627 InnerVT = InnerVT.changeVectorElementType(
1628 MVT::getFloatingPointVT(2 * InnerVT.getScalarSizeInBits()));
1632 // Lower fixed length vector operations to scalable equivalents.
1633 setOperationAction(ISD::ABS, VT, Custom);
1634 setOperationAction(ISD::ADD, VT, Custom);
1635 setOperationAction(ISD::AND, VT, Custom);
1636 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1637 setOperationAction(ISD::BITCAST, VT, Custom);
1638 setOperationAction(ISD::BITREVERSE, VT, Custom);
1639 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1640 setOperationAction(ISD::BSWAP, VT, Custom);
1641 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1642 setOperationAction(ISD::CTLZ, VT, Custom);
1643 setOperationAction(ISD::CTPOP, VT, Custom);
1644 setOperationAction(ISD::CTTZ, VT, Custom);
1645 setOperationAction(ISD::FABS, VT, Custom);
1646 setOperationAction(ISD::FADD, VT, Custom);
1647 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1648 setOperationAction(ISD::FCEIL, VT, Custom);
1649 setOperationAction(ISD::FDIV, VT, Custom);
1650 setOperationAction(ISD::FFLOOR, VT, Custom);
1651 setOperationAction(ISD::FMA, VT, Custom);
1652 setOperationAction(ISD::FMAXIMUM, VT, Custom);
1653 setOperationAction(ISD::FMAXNUM, VT, Custom);
1654 setOperationAction(ISD::FMINIMUM, VT, Custom);
1655 setOperationAction(ISD::FMINNUM, VT, Custom);
1656 setOperationAction(ISD::FMUL, VT, Custom);
1657 setOperationAction(ISD::FNEARBYINT, VT, Custom);
1658 setOperationAction(ISD::FNEG, VT, Custom);
1659 setOperationAction(ISD::FP_EXTEND, VT, Custom);
1660 setOperationAction(ISD::FP_ROUND, VT, Custom);
1661 setOperationAction(ISD::FP_TO_SINT, VT, Custom);
1662 setOperationAction(ISD::FP_TO_UINT, VT, Custom);
1663 setOperationAction(ISD::FRINT, VT, Custom);
1664 setOperationAction(ISD::FROUND, VT, Custom);
1665 setOperationAction(ISD::FROUNDEVEN, VT, Custom);
1666 setOperationAction(ISD::FSQRT, VT, Custom);
1667 setOperationAction(ISD::FSUB, VT, Custom);
1668 setOperationAction(ISD::FTRUNC, VT, Custom);
1669 setOperationAction(ISD::LOAD, VT, Custom);
1670 setOperationAction(ISD::MGATHER, VT, Custom);
1671 setOperationAction(ISD::MLOAD, VT, Custom);
1672 setOperationAction(ISD::MSCATTER, VT, Custom);
1673 setOperationAction(ISD::MSTORE, VT, Custom);
1674 setOperationAction(ISD::MUL, VT, Custom);
1675 setOperationAction(ISD::MULHS, VT, Custom);
1676 setOperationAction(ISD::MULHU, VT, Custom);
1677 setOperationAction(ISD::OR, VT, Custom);
1678 setOperationAction(ISD::SDIV, VT, Custom);
1679 setOperationAction(ISD::SELECT, VT, Custom);
1680 setOperationAction(ISD::SETCC, VT, Custom);
1681 setOperationAction(ISD::SHL, VT, Custom);
1682 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1683 setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
1684 setOperationAction(ISD::SINT_TO_FP, VT, Custom);
1685 setOperationAction(ISD::SMAX, VT, Custom);
1686 setOperationAction(ISD::SMIN, VT, Custom);
1687 setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
1688 setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
1689 setOperationAction(ISD::SRA, VT, Custom);
1690 setOperationAction(ISD::SRL, VT, Custom);
1691 setOperationAction(ISD::STORE, VT, Custom);
1692 setOperationAction(ISD::SUB, VT, Custom);
1693 setOperationAction(ISD::TRUNCATE, VT, Custom);
1694 setOperationAction(ISD::UDIV, VT, Custom);
1695 setOperationAction(ISD::UINT_TO_FP, VT, Custom);
1696 setOperationAction(ISD::UMAX, VT, Custom);
1697 setOperationAction(ISD::UMIN, VT, Custom);
1698 setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
1699 setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
1700 setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
1701 setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
1702 setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
1703 setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
1704 setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
1705 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1706 setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
1707 setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
1708 setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
1709 setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
1710 setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
1711 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1712 setOperationAction(ISD::VSELECT, VT, Custom);
1713 setOperationAction(ISD::XOR, VT, Custom);
1714 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1717 void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
1718 addRegisterClass(VT, &AArch64::FPR64RegClass);
1722 void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
1723 addRegisterClass(VT, &AArch64::FPR128RegClass);
1727 EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
1728 LLVMContext &C, EVT VT) const {
1731 if (VT.isScalableVector())
1732 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1733 return VT.changeVectorElementTypeToInteger();
1736 static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
1737 const APInt &Demanded,
1738 TargetLowering::TargetLoweringOpt &TLO,
1740 uint64_t OldImm = Imm, NewImm, Enc;
1741 uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;
1743   // Return if the immediate is already all zeros, all ones, a bimm32 or a bimm64.
1745 if (Imm == 0 || Imm == Mask ||
1746 AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
1749 unsigned EltSize = Size;
1750 uint64_t DemandedBits = Demanded.getZExtValue();
1752 // Clear bits that are not demanded.
1753 Imm &= DemandedBits;
1756 // The goal here is to set the non-demanded bits in a way that minimizes
1757   // the number of transitions between 0 and 1. In order to achieve this goal,
1758 // we set the non-demanded bits to the value of the preceding demanded bits.
1759 // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
1760 // non-demanded bit), we copy bit0 (1) to the least significant 'x',
1761 // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
1762 // The final result is 0b11000011.
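  // Working through that example: DemandedBits = 0b01100101, so after masking
  // Imm = 0b01000001 and the inverted demanded bits are InvertedImm = 0b00100100.
  // The rotate/add trick below then fills the non-demanded positions, producing
  // Ones = 0b10000010, and NewImm = (Imm | Ones) & Mask = 0b11000011 as above.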
1763 uint64_t NonDemandedBits = ~DemandedBits;
1764 uint64_t InvertedImm = ~Imm & DemandedBits;
1765 uint64_t RotatedImm =
1766 ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
1768 uint64_t Sum = RotatedImm + NonDemandedBits;
1769 bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
1770 uint64_t Ones = (Sum + Carry) & NonDemandedBits;
1771 NewImm = (Imm | Ones) & Mask;
1773 // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
1774 // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
1775 // we halve the element size and continue the search.
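  // (A single contiguous run of ones such as 0x0FF0, for instance, is a shifted
  // mask and hence a valid bitmask immediate.)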
1776 if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
1779   // We cannot shrink the element size any further if it is already 2 bits.
1785 uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;
1787   // Return if there is a mismatch in any of the demanded bits of Imm and Hi.
1788 if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
1791 // Merge the upper and lower halves of Imm and DemandedBits.
1793 DemandedBits |= DemandedBitsHi;
1798 // Replicate the element across the register width.
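  // e.g. a 16-bit element pattern 0x00F0 replicated across 64 bits becomes
  // 0x00F000F000F000F0.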
1799 while (EltSize < Size) {
1800 NewImm |= NewImm << EltSize;
1805 assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
1806 "demanded bits should never be altered");
1807 assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");
1809 // Create the new constant immediate node.
1810 EVT VT = Op.getValueType();
1814 // If the new constant immediate is all-zeros or all-ones, let the target
1815 // independent DAG combine optimize this node.
1816 if (NewImm == 0 || NewImm == OrigMask) {
1817 New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
1818 TLO.DAG.getConstant(NewImm, DL, VT));
1819 // Otherwise, create a machine node so that target independent DAG combine
1820 // doesn't undo this optimization.
1822 Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
1823 SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
1825 TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
1828 return TLO.CombineTo(Op, New);
1831 bool AArch64TargetLowering::targetShrinkDemandedConstant(
1832 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
1833 TargetLoweringOpt &TLO) const {
1834 // Delay this optimization to as late as possible.
1838 if (!EnableOptimizeLogicalImm)
1841 EVT VT = Op.getValueType();
1845 unsigned Size = VT.getSizeInBits();
1846 assert((Size == 32 || Size == 64) &&
1847 "i32 or i64 is expected after legalization.");
1849 // Exit early if we demand all bits.
1850 if (DemandedBits.countPopulation() == Size)
1854 switch (Op.getOpcode()) {
1858 NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
1861 NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
1864 NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
1867 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
1870 uint64_t Imm = C->getZExtValue();
1871 return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
1874 /// computeKnownBitsForTargetNode - Determine which of the bits specified in
1875 /// Mask are known to be either zero or one and return them in Known.
1876 void AArch64TargetLowering::computeKnownBitsForTargetNode(
1877 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
1878 const SelectionDAG &DAG, unsigned Depth) const {
1879 switch (Op.getOpcode()) {
1882 case AArch64ISD::DUP: {
1883 SDValue SrcOp = Op.getOperand(0);
1884 Known = DAG.computeKnownBits(SrcOp, Depth + 1);
1885 if (SrcOp.getValueSizeInBits() != Op.getScalarValueSizeInBits()) {
1886 assert(SrcOp.getValueSizeInBits() > Op.getScalarValueSizeInBits() &&
1887 "Expected DUP implicit truncation");
1888 Known = Known.trunc(Op.getScalarValueSizeInBits());
1892 case AArch64ISD::CSEL: {
1894 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1895 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1896 Known = KnownBits::commonBits(Known, Known2);
1899 case AArch64ISD::BICi: {
1900 // Compute the bit cleared value.
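    // e.g. a BICi with immediate 0xFF and shift 8 forces bits [15:8] of the
    // result to zero.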
1902 ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2));
1903 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1904 Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask));
1907 case AArch64ISD::VLSHR: {
1909 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1910 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1911 Known = KnownBits::lshr(Known, Known2);
1914 case AArch64ISD::VASHR: {
1916 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
1917 Known2 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
1918 Known = KnownBits::ashr(Known, Known2);
1921 case AArch64ISD::LOADgot:
1922 case AArch64ISD::ADDlow: {
1923 if (!Subtarget->isTargetILP32())
1925 // In ILP32 mode all valid pointers are in the low 4GB of the address-space.
1926 Known.Zero = APInt::getHighBitsSet(64, 32);
1929 case AArch64ISD::ASSERT_ZEXT_BOOL: {
1930 Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
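    // The operand is a zero-extended i1, so bits [7:1] are additionally known
    // to be zero.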
1931 Known.Zero |= APInt(Known.getBitWidth(), 0xFE);
1934 case ISD::INTRINSIC_W_CHAIN: {
1935 ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
1936 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
1939 case Intrinsic::aarch64_ldaxr:
1940 case Intrinsic::aarch64_ldxr: {
1941 unsigned BitWidth = Known.getBitWidth();
1942 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
1943 unsigned MemBits = VT.getScalarSizeInBits();
1944 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
1950 case ISD::INTRINSIC_WO_CHAIN:
1951 case ISD::INTRINSIC_VOID: {
1952 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
1956 case Intrinsic::aarch64_neon_umaxv:
1957 case Intrinsic::aarch64_neon_uminv: {
1958       // Figure out the datatype of the vector operand. The UMINV instruction
1959       // will zero extend the result, so we can mark all bits above the element
1960       // width as known zero. 32-bit or larger elements don't need this, as those
1961       // are legal types and will be handled by isel directly.
1962 MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
1963 unsigned BitWidth = Known.getBitWidth();
1964 if (VT == MVT::v8i8 || VT == MVT::v16i8) {
1965 assert(BitWidth >= 8 && "Unexpected width!");
1966 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
1968 } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
1969 assert(BitWidth >= 16 && "Unexpected width!");
1970 APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
1980 MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
1985 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
1986 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1988 if (Subtarget->requiresStrictAlign())
1992 // Some CPUs are fine with unaligned stores except for 128-bit ones.
1993 *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
1994 // See comments in performSTORECombine() for more details about
1995 // these conditions.
1997 // Code that uses clang vector extensions can mark that it
1998 // wants unaligned accesses to be treated as fast by
1999 // underspecifying alignment to be 1 or 2.
2002 // Disregard v2i64. Memcpy lowering produces those and splitting
2003 // them regresses performance on micro-benchmarks and olden/bh.
2009 // Same as above but handling LLTs instead.
2010 bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
2011 LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2013 if (Subtarget->requiresStrictAlign())
2017 // Some CPUs are fine with unaligned stores except for 128-bit ones.
2018 *Fast = !Subtarget->isMisaligned128StoreSlow() ||
2019 Ty.getSizeInBytes() != 16 ||
2020 // See comments in performSTORECombine() for more details about
2021 // these conditions.
2023 // Code that uses clang vector extensions can mark that it
2024 // wants unaligned accesses to be treated as fast by
2025 // underspecifying alignment to be 1 or 2.
2028 // Disregard v2i64. Memcpy lowering produces those and splitting
2029 // them regresses performance on micro-benchmarks and olden/bh.
2030 Ty == LLT::fixed_vector(2, 64);
2036 AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
2037 const TargetLibraryInfo *libInfo) const {
2038 return AArch64::createFastISel(funcInfo, libInfo);
2041 const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2042 #define MAKE_CASE(V) \
2045 switch ((AArch64ISD::NodeType)Opcode) {
2046 case AArch64ISD::FIRST_NUMBER:
2048 MAKE_CASE(AArch64ISD::CALL)
2049 MAKE_CASE(AArch64ISD::ADRP)
2050 MAKE_CASE(AArch64ISD::ADR)
2051 MAKE_CASE(AArch64ISD::ADDlow)
2052 MAKE_CASE(AArch64ISD::LOADgot)
2053 MAKE_CASE(AArch64ISD::RET_FLAG)
2054 MAKE_CASE(AArch64ISD::BRCOND)
2055 MAKE_CASE(AArch64ISD::CSEL)
2056 MAKE_CASE(AArch64ISD::CSINV)
2057 MAKE_CASE(AArch64ISD::CSNEG)
2058 MAKE_CASE(AArch64ISD::CSINC)
2059 MAKE_CASE(AArch64ISD::THREAD_POINTER)
2060 MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
2061 MAKE_CASE(AArch64ISD::ABDS_PRED)
2062 MAKE_CASE(AArch64ISD::ABDU_PRED)
2063 MAKE_CASE(AArch64ISD::MUL_PRED)
2064 MAKE_CASE(AArch64ISD::MULHS_PRED)
2065 MAKE_CASE(AArch64ISD::MULHU_PRED)
2066 MAKE_CASE(AArch64ISD::SDIV_PRED)
2067 MAKE_CASE(AArch64ISD::SHL_PRED)
2068 MAKE_CASE(AArch64ISD::SMAX_PRED)
2069 MAKE_CASE(AArch64ISD::SMIN_PRED)
2070 MAKE_CASE(AArch64ISD::SRA_PRED)
2071 MAKE_CASE(AArch64ISD::SRL_PRED)
2072 MAKE_CASE(AArch64ISD::UDIV_PRED)
2073 MAKE_CASE(AArch64ISD::UMAX_PRED)
2074 MAKE_CASE(AArch64ISD::UMIN_PRED)
2075 MAKE_CASE(AArch64ISD::SRAD_MERGE_OP1)
2076 MAKE_CASE(AArch64ISD::FNEG_MERGE_PASSTHRU)
2077 MAKE_CASE(AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU)
2078 MAKE_CASE(AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU)
2079 MAKE_CASE(AArch64ISD::FCEIL_MERGE_PASSTHRU)
2080 MAKE_CASE(AArch64ISD::FFLOOR_MERGE_PASSTHRU)
2081 MAKE_CASE(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU)
2082 MAKE_CASE(AArch64ISD::FRINT_MERGE_PASSTHRU)
2083 MAKE_CASE(AArch64ISD::FROUND_MERGE_PASSTHRU)
2084 MAKE_CASE(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU)
2085 MAKE_CASE(AArch64ISD::FTRUNC_MERGE_PASSTHRU)
2086 MAKE_CASE(AArch64ISD::FP_ROUND_MERGE_PASSTHRU)
2087 MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
2088 MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
2089 MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
2090 MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
2091 MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
2092 MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
2093 MAKE_CASE(AArch64ISD::FRECPX_MERGE_PASSTHRU)
2094 MAKE_CASE(AArch64ISD::FABS_MERGE_PASSTHRU)
2095 MAKE_CASE(AArch64ISD::ABS_MERGE_PASSTHRU)
2096 MAKE_CASE(AArch64ISD::NEG_MERGE_PASSTHRU)
2097 MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
2098 MAKE_CASE(AArch64ISD::ADC)
2099 MAKE_CASE(AArch64ISD::SBC)
2100 MAKE_CASE(AArch64ISD::ADDS)
2101 MAKE_CASE(AArch64ISD::SUBS)
2102 MAKE_CASE(AArch64ISD::ADCS)
2103 MAKE_CASE(AArch64ISD::SBCS)
2104 MAKE_CASE(AArch64ISD::ANDS)
2105 MAKE_CASE(AArch64ISD::CCMP)
2106 MAKE_CASE(AArch64ISD::CCMN)
2107 MAKE_CASE(AArch64ISD::FCCMP)
2108 MAKE_CASE(AArch64ISD::FCMP)
2109 MAKE_CASE(AArch64ISD::STRICT_FCMP)
2110 MAKE_CASE(AArch64ISD::STRICT_FCMPE)
2111 MAKE_CASE(AArch64ISD::DUP)
2112 MAKE_CASE(AArch64ISD::DUPLANE8)
2113 MAKE_CASE(AArch64ISD::DUPLANE16)
2114 MAKE_CASE(AArch64ISD::DUPLANE32)
2115 MAKE_CASE(AArch64ISD::DUPLANE64)
2116 MAKE_CASE(AArch64ISD::DUPLANE128)
2117 MAKE_CASE(AArch64ISD::MOVI)
2118 MAKE_CASE(AArch64ISD::MOVIshift)
2119 MAKE_CASE(AArch64ISD::MOVIedit)
2120 MAKE_CASE(AArch64ISD::MOVImsl)
2121 MAKE_CASE(AArch64ISD::FMOV)
2122 MAKE_CASE(AArch64ISD::MVNIshift)
2123 MAKE_CASE(AArch64ISD::MVNImsl)
2124 MAKE_CASE(AArch64ISD::BICi)
2125 MAKE_CASE(AArch64ISD::ORRi)
2126 MAKE_CASE(AArch64ISD::BSP)
2127 MAKE_CASE(AArch64ISD::EXTR)
2128 MAKE_CASE(AArch64ISD::ZIP1)
2129 MAKE_CASE(AArch64ISD::ZIP2)
2130 MAKE_CASE(AArch64ISD::UZP1)
2131 MAKE_CASE(AArch64ISD::UZP2)
2132 MAKE_CASE(AArch64ISD::TRN1)
2133 MAKE_CASE(AArch64ISD::TRN2)
2134 MAKE_CASE(AArch64ISD::REV16)
2135 MAKE_CASE(AArch64ISD::REV32)
2136 MAKE_CASE(AArch64ISD::REV64)
2137 MAKE_CASE(AArch64ISD::EXT)
2138 MAKE_CASE(AArch64ISD::SPLICE)
2139 MAKE_CASE(AArch64ISD::VSHL)
2140 MAKE_CASE(AArch64ISD::VLSHR)
2141 MAKE_CASE(AArch64ISD::VASHR)
2142 MAKE_CASE(AArch64ISD::VSLI)
2143 MAKE_CASE(AArch64ISD::VSRI)
2144 MAKE_CASE(AArch64ISD::CMEQ)
2145 MAKE_CASE(AArch64ISD::CMGE)
2146 MAKE_CASE(AArch64ISD::CMGT)
2147 MAKE_CASE(AArch64ISD::CMHI)
2148 MAKE_CASE(AArch64ISD::CMHS)
2149 MAKE_CASE(AArch64ISD::FCMEQ)
2150 MAKE_CASE(AArch64ISD::FCMGE)
2151 MAKE_CASE(AArch64ISD::FCMGT)
2152 MAKE_CASE(AArch64ISD::CMEQz)
2153 MAKE_CASE(AArch64ISD::CMGEz)
2154 MAKE_CASE(AArch64ISD::CMGTz)
2155 MAKE_CASE(AArch64ISD::CMLEz)
2156 MAKE_CASE(AArch64ISD::CMLTz)
2157 MAKE_CASE(AArch64ISD::FCMEQz)
2158 MAKE_CASE(AArch64ISD::FCMGEz)
2159 MAKE_CASE(AArch64ISD::FCMGTz)
2160 MAKE_CASE(AArch64ISD::FCMLEz)
2161 MAKE_CASE(AArch64ISD::FCMLTz)
2162 MAKE_CASE(AArch64ISD::SADDV)
2163 MAKE_CASE(AArch64ISD::UADDV)
2164 MAKE_CASE(AArch64ISD::SDOT)
2165 MAKE_CASE(AArch64ISD::UDOT)
2166 MAKE_CASE(AArch64ISD::SMINV)
2167 MAKE_CASE(AArch64ISD::UMINV)
2168 MAKE_CASE(AArch64ISD::SMAXV)
2169 MAKE_CASE(AArch64ISD::UMAXV)
2170 MAKE_CASE(AArch64ISD::SADDV_PRED)
2171 MAKE_CASE(AArch64ISD::UADDV_PRED)
2172 MAKE_CASE(AArch64ISD::SMAXV_PRED)
2173 MAKE_CASE(AArch64ISD::UMAXV_PRED)
2174 MAKE_CASE(AArch64ISD::SMINV_PRED)
2175 MAKE_CASE(AArch64ISD::UMINV_PRED)
2176 MAKE_CASE(AArch64ISD::ORV_PRED)
2177 MAKE_CASE(AArch64ISD::EORV_PRED)
2178 MAKE_CASE(AArch64ISD::ANDV_PRED)
2179 MAKE_CASE(AArch64ISD::CLASTA_N)
2180 MAKE_CASE(AArch64ISD::CLASTB_N)
2181 MAKE_CASE(AArch64ISD::LASTA)
2182 MAKE_CASE(AArch64ISD::LASTB)
2183 MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
2184 MAKE_CASE(AArch64ISD::LS64_BUILD)
2185 MAKE_CASE(AArch64ISD::LS64_EXTRACT)
2186 MAKE_CASE(AArch64ISD::TBL)
2187 MAKE_CASE(AArch64ISD::FADD_PRED)
2188 MAKE_CASE(AArch64ISD::FADDA_PRED)
2189 MAKE_CASE(AArch64ISD::FADDV_PRED)
2190 MAKE_CASE(AArch64ISD::FDIV_PRED)
2191 MAKE_CASE(AArch64ISD::FMA_PRED)
2192 MAKE_CASE(AArch64ISD::FMAX_PRED)
2193 MAKE_CASE(AArch64ISD::FMAXV_PRED)
2194 MAKE_CASE(AArch64ISD::FMAXNM_PRED)
2195 MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
2196 MAKE_CASE(AArch64ISD::FMIN_PRED)
2197 MAKE_CASE(AArch64ISD::FMINV_PRED)
2198 MAKE_CASE(AArch64ISD::FMINNM_PRED)
2199 MAKE_CASE(AArch64ISD::FMINNMV_PRED)
2200 MAKE_CASE(AArch64ISD::FMUL_PRED)
2201 MAKE_CASE(AArch64ISD::FSUB_PRED)
2202 MAKE_CASE(AArch64ISD::RDSVL)
2203 MAKE_CASE(AArch64ISD::BIC)
2204 MAKE_CASE(AArch64ISD::BIT)
2205 MAKE_CASE(AArch64ISD::CBZ)
2206 MAKE_CASE(AArch64ISD::CBNZ)
2207 MAKE_CASE(AArch64ISD::TBZ)
2208 MAKE_CASE(AArch64ISD::TBNZ)
2209 MAKE_CASE(AArch64ISD::TC_RETURN)
2210 MAKE_CASE(AArch64ISD::PREFETCH)
2211 MAKE_CASE(AArch64ISD::SITOF)
2212 MAKE_CASE(AArch64ISD::UITOF)
2213 MAKE_CASE(AArch64ISD::NVCAST)
2214 MAKE_CASE(AArch64ISD::MRS)
2215 MAKE_CASE(AArch64ISD::SQSHL_I)
2216 MAKE_CASE(AArch64ISD::UQSHL_I)
2217 MAKE_CASE(AArch64ISD::SRSHR_I)
2218 MAKE_CASE(AArch64ISD::URSHR_I)
2219 MAKE_CASE(AArch64ISD::SQSHLU_I)
2220 MAKE_CASE(AArch64ISD::WrapperLarge)
2221 MAKE_CASE(AArch64ISD::LD2post)
2222 MAKE_CASE(AArch64ISD::LD3post)
2223 MAKE_CASE(AArch64ISD::LD4post)
2224 MAKE_CASE(AArch64ISD::ST2post)
2225 MAKE_CASE(AArch64ISD::ST3post)
2226 MAKE_CASE(AArch64ISD::ST4post)
2227 MAKE_CASE(AArch64ISD::LD1x2post)
2228 MAKE_CASE(AArch64ISD::LD1x3post)
2229 MAKE_CASE(AArch64ISD::LD1x4post)
2230 MAKE_CASE(AArch64ISD::ST1x2post)
2231 MAKE_CASE(AArch64ISD::ST1x3post)
2232 MAKE_CASE(AArch64ISD::ST1x4post)
2233 MAKE_CASE(AArch64ISD::LD1DUPpost)
2234 MAKE_CASE(AArch64ISD::LD2DUPpost)
2235 MAKE_CASE(AArch64ISD::LD3DUPpost)
2236 MAKE_CASE(AArch64ISD::LD4DUPpost)
2237 MAKE_CASE(AArch64ISD::LD1LANEpost)
2238 MAKE_CASE(AArch64ISD::LD2LANEpost)
2239 MAKE_CASE(AArch64ISD::LD3LANEpost)
2240 MAKE_CASE(AArch64ISD::LD4LANEpost)
2241 MAKE_CASE(AArch64ISD::ST2LANEpost)
2242 MAKE_CASE(AArch64ISD::ST3LANEpost)
2243 MAKE_CASE(AArch64ISD::ST4LANEpost)
2244 MAKE_CASE(AArch64ISD::SMULL)
2245 MAKE_CASE(AArch64ISD::UMULL)
2246 MAKE_CASE(AArch64ISD::FRECPE)
2247 MAKE_CASE(AArch64ISD::FRECPS)
2248 MAKE_CASE(AArch64ISD::FRSQRTE)
2249 MAKE_CASE(AArch64ISD::FRSQRTS)
2250 MAKE_CASE(AArch64ISD::STG)
2251 MAKE_CASE(AArch64ISD::STZG)
2252 MAKE_CASE(AArch64ISD::ST2G)
2253 MAKE_CASE(AArch64ISD::STZ2G)
2254 MAKE_CASE(AArch64ISD::SUNPKHI)
2255 MAKE_CASE(AArch64ISD::SUNPKLO)
2256 MAKE_CASE(AArch64ISD::UUNPKHI)
2257 MAKE_CASE(AArch64ISD::UUNPKLO)
2258 MAKE_CASE(AArch64ISD::INSR)
2259 MAKE_CASE(AArch64ISD::PTEST)
2260 MAKE_CASE(AArch64ISD::PTRUE)
2261 MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
2262 MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
2263 MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
2264 MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
2265 MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
2266 MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
2267 MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
2268 MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
2269 MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
2270 MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
2271 MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
2272 MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
2273 MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
2274 MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
2275 MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
2276 MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
2277 MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
2278 MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
2279 MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
2280 MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
2281 MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
2282 MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
2283 MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
2284 MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
2285 MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
2286 MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
2287 MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
2288 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
2289 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
2290 MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
2291 MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
2292 MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
2293 MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
2294 MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
2295 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
2296 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
2297 MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
2298 MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
2299 MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
2300 MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
2301 MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
2302 MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
2303 MAKE_CASE(AArch64ISD::ST1_PRED)
2304 MAKE_CASE(AArch64ISD::SST1_PRED)
2305 MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
2306 MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
2307 MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
2308 MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
2309 MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
2310 MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
2311 MAKE_CASE(AArch64ISD::SSTNT1_PRED)
2312 MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
2313 MAKE_CASE(AArch64ISD::LDP)
2314 MAKE_CASE(AArch64ISD::STP)
2315 MAKE_CASE(AArch64ISD::STNP)
2316 MAKE_CASE(AArch64ISD::BITREVERSE_MERGE_PASSTHRU)
2317 MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
2318 MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
2319 MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
2320 MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
2321 MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
2322 MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
2323 MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
2324 MAKE_CASE(AArch64ISD::INDEX_VECTOR)
2325 MAKE_CASE(AArch64ISD::ADDP)
2326 MAKE_CASE(AArch64ISD::SADDLP)
2327 MAKE_CASE(AArch64ISD::UADDLP)
2328 MAKE_CASE(AArch64ISD::CALL_RVMARKER)
2329 MAKE_CASE(AArch64ISD::ASSERT_ZEXT_BOOL)
2330 MAKE_CASE(AArch64ISD::MOPS_MEMSET)
2331 MAKE_CASE(AArch64ISD::MOPS_MEMSET_TAGGING)
2332 MAKE_CASE(AArch64ISD::MOPS_MEMCOPY)
2333 MAKE_CASE(AArch64ISD::MOPS_MEMMOVE)
2334 MAKE_CASE(AArch64ISD::CALL_BTI)
2341 AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
2342 MachineBasicBlock *MBB) const {
2343   // We materialise the F128CSEL pseudo-instruction as some control flow and a phi node:
2347 // [... previous instrs leading to comparison ...]
2353 // Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
2355 MachineFunction *MF = MBB->getParent();
2356 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2357 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
2358 DebugLoc DL = MI.getDebugLoc();
2359 MachineFunction::iterator It = ++MBB->getIterator();
2361 Register DestReg = MI.getOperand(0).getReg();
2362 Register IfTrueReg = MI.getOperand(1).getReg();
2363 Register IfFalseReg = MI.getOperand(2).getReg();
2364 unsigned CondCode = MI.getOperand(3).getImm();
2365 bool NZCVKilled = MI.getOperand(4).isKill();
2367 MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
2368 MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
2369 MF->insert(It, TrueBB);
2370 MF->insert(It, EndBB);
2372 // Transfer rest of current basic-block to EndBB
2373 EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
2375 EndBB->transferSuccessorsAndUpdatePHIs(MBB);
2377 BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
2378 BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
2379 MBB->addSuccessor(TrueBB);
2380 MBB->addSuccessor(EndBB);
2382 // TrueBB falls through to the end.
2383 TrueBB->addSuccessor(EndBB);
2386 TrueBB->addLiveIn(AArch64::NZCV);
2387 EndBB->addLiveIn(AArch64::NZCV);
2390 BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
2396 MI.eraseFromParent();
2400 MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
2401 MachineInstr &MI, MachineBasicBlock *BB) const {
2402 assert(!isAsynchronousEHPersonality(classifyEHPersonality(
2403 BB->getParent()->getFunction().getPersonalityFn())) &&
2404 "SEH does not use catchret!");
2409 AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
2411 MachineBasicBlock *BB) const {
2412 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2413 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2415 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2416 MIB.add(MI.getOperand(1)); // slice index register
2417 MIB.add(MI.getOperand(2)); // slice index offset
2418 MIB.add(MI.getOperand(3)); // pg
2419 MIB.add(MI.getOperand(4)); // base
2420 MIB.add(MI.getOperand(5)); // offset
2422 MI.eraseFromParent(); // The pseudo is gone now.
2427 AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
2428 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2429 MachineInstrBuilder MIB =
2430 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
2432 MIB.addReg(AArch64::ZA, RegState::Define);
2433 MIB.add(MI.getOperand(0)); // Vector select register
2434 MIB.add(MI.getOperand(1)); // Vector select offset
2435 MIB.add(MI.getOperand(2)); // Base
2436 MIB.add(MI.getOperand(1)); // Offset, same as vector select offset
2438 MI.eraseFromParent(); // The pseudo is gone now.
2443 AArch64TargetLowering::EmitMopa(unsigned Opc, unsigned BaseReg,
2444 MachineInstr &MI, MachineBasicBlock *BB) const {
2445 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2446 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2448 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2449 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2450 MIB.add(MI.getOperand(1)); // pn
2451 MIB.add(MI.getOperand(2)); // pm
2452 MIB.add(MI.getOperand(3)); // zn
2453 MIB.add(MI.getOperand(4)); // zm
2455 MI.eraseFromParent(); // The pseudo is gone now.
2460 AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
2462 MachineBasicBlock *BB) const {
2463 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2464 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2466 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2467 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2468 MIB.add(MI.getOperand(1)); // Slice index register
2469 MIB.add(MI.getOperand(2)); // Slice index offset
2470 MIB.add(MI.getOperand(3)); // pg
2471 MIB.add(MI.getOperand(4)); // zn
2473 MI.eraseFromParent(); // The pseudo is gone now.
2478 AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2479 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2480 MachineInstrBuilder MIB =
2481 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::ZERO_M));
2482 MIB.add(MI.getOperand(0)); // Mask
2484 unsigned Mask = MI.getOperand(0).getImm();
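  // Each set bit in the mask selects one 64-bit ZA tile (ZAD0..ZAD7) that this
  // instruction implicitly defines.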
2485 for (unsigned I = 0; I < 8; I++) {
2486 if (Mask & (1 << I))
2487 MIB.addDef(AArch64::ZAD0 + I, RegState::ImplicitDefine);
2490 MI.eraseFromParent(); // The pseudo is gone now.
2495 AArch64TargetLowering::EmitAddVectorToTile(unsigned Opc, unsigned BaseReg,
2497 MachineBasicBlock *BB) const {
2498 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2499 MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
2501 MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
2502 MIB.addReg(BaseReg + MI.getOperand(0).getImm());
2503 MIB.add(MI.getOperand(1)); // pn
2504 MIB.add(MI.getOperand(2)); // pm
2505 MIB.add(MI.getOperand(3)); // zn
2507 MI.eraseFromParent(); // The pseudo is gone now.
2511 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2512 MachineInstr &MI, MachineBasicBlock *BB) const {
2513 switch (MI.getOpcode()) {
2518 llvm_unreachable("Unexpected instruction for custom inserter!");
2520 case AArch64::F128CSEL:
2521 return EmitF128CSEL(MI, BB);
2523 case TargetOpcode::STATEPOINT:
2524     // STATEPOINT is a pseudo instruction which has no implicit defs/uses,
2525     // while the BL call instruction (to which the statepoint is lowered at the
2526     // end) has an implicit def. This def is early-clobber as it is set at
2527     // the moment of the call, before any use is read.
2528     // Add this implicit dead def here as a workaround.
2529 MI.addOperand(*MI.getMF(),
2530 MachineOperand::CreateReg(
2531 AArch64::LR, /*isDef*/ true,
2532 /*isImp*/ true, /*isKill*/ false, /*isDead*/ true,
2533 /*isUndef*/ false, /*isEarlyClobber*/ true));
2535 case TargetOpcode::STACKMAP:
2536 case TargetOpcode::PATCHPOINT:
2537 return emitPatchPoint(MI, BB);
2539 case AArch64::CATCHRET:
2540 return EmitLoweredCatchRet(MI, BB);
2541 case AArch64::LD1_MXIPXX_H_PSEUDO_B:
2542 return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
2543 case AArch64::LD1_MXIPXX_H_PSEUDO_H:
2544 return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
2545 case AArch64::LD1_MXIPXX_H_PSEUDO_S:
2546 return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
2547 case AArch64::LD1_MXIPXX_H_PSEUDO_D:
2548 return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
2549 case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
2550 return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
2551 case AArch64::LD1_MXIPXX_V_PSEUDO_B:
2552 return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
2553 case AArch64::LD1_MXIPXX_V_PSEUDO_H:
2554 return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
2555 case AArch64::LD1_MXIPXX_V_PSEUDO_S:
2556 return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
2557 case AArch64::LD1_MXIPXX_V_PSEUDO_D:
2558 return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
2559 case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
2560 return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
2561 case AArch64::LDR_ZA_PSEUDO:
2562 return EmitFill(MI, BB);
2563 case AArch64::BFMOPA_MPPZZ_PSEUDO:
2564 return EmitMopa(AArch64::BFMOPA_MPPZZ, AArch64::ZAS0, MI, BB);
2565 case AArch64::BFMOPS_MPPZZ_PSEUDO:
2566 return EmitMopa(AArch64::BFMOPS_MPPZZ, AArch64::ZAS0, MI, BB);
2567 case AArch64::FMOPAL_MPPZZ_PSEUDO:
2568 return EmitMopa(AArch64::FMOPAL_MPPZZ, AArch64::ZAS0, MI, BB);
2569 case AArch64::FMOPSL_MPPZZ_PSEUDO:
2570 return EmitMopa(AArch64::FMOPSL_MPPZZ, AArch64::ZAS0, MI, BB);
2571 case AArch64::FMOPA_MPPZZ_S_PSEUDO:
2572 return EmitMopa(AArch64::FMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2573 case AArch64::FMOPS_MPPZZ_S_PSEUDO:
2574 return EmitMopa(AArch64::FMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2575 case AArch64::FMOPA_MPPZZ_D_PSEUDO:
2576 return EmitMopa(AArch64::FMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2577 case AArch64::FMOPS_MPPZZ_D_PSEUDO:
2578 return EmitMopa(AArch64::FMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2579 case AArch64::SMOPA_MPPZZ_S_PSEUDO:
2580 return EmitMopa(AArch64::SMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2581 case AArch64::SMOPS_MPPZZ_S_PSEUDO:
2582 return EmitMopa(AArch64::SMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2583 case AArch64::UMOPA_MPPZZ_S_PSEUDO:
2584 return EmitMopa(AArch64::UMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2585 case AArch64::UMOPS_MPPZZ_S_PSEUDO:
2586 return EmitMopa(AArch64::UMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2587 case AArch64::SUMOPA_MPPZZ_S_PSEUDO:
2588 return EmitMopa(AArch64::SUMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2589 case AArch64::SUMOPS_MPPZZ_S_PSEUDO:
2590 return EmitMopa(AArch64::SUMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2591 case AArch64::USMOPA_MPPZZ_S_PSEUDO:
2592 return EmitMopa(AArch64::USMOPA_MPPZZ_S, AArch64::ZAS0, MI, BB);
2593 case AArch64::USMOPS_MPPZZ_S_PSEUDO:
2594 return EmitMopa(AArch64::USMOPS_MPPZZ_S, AArch64::ZAS0, MI, BB);
2595 case AArch64::SMOPA_MPPZZ_D_PSEUDO:
2596 return EmitMopa(AArch64::SMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2597 case AArch64::SMOPS_MPPZZ_D_PSEUDO:
2598 return EmitMopa(AArch64::SMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2599 case AArch64::UMOPA_MPPZZ_D_PSEUDO:
2600 return EmitMopa(AArch64::UMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2601 case AArch64::UMOPS_MPPZZ_D_PSEUDO:
2602 return EmitMopa(AArch64::UMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2603 case AArch64::SUMOPA_MPPZZ_D_PSEUDO:
2604 return EmitMopa(AArch64::SUMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2605 case AArch64::SUMOPS_MPPZZ_D_PSEUDO:
2606 return EmitMopa(AArch64::SUMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2607 case AArch64::USMOPA_MPPZZ_D_PSEUDO:
2608 return EmitMopa(AArch64::USMOPA_MPPZZ_D, AArch64::ZAD0, MI, BB);
2609 case AArch64::USMOPS_MPPZZ_D_PSEUDO:
2610 return EmitMopa(AArch64::USMOPS_MPPZZ_D, AArch64::ZAD0, MI, BB);
2611 case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
2612 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
2614 case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
2615 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
2617 case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
2618 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
2620 case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
2621 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
2623 case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
2624 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
2626 case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
2627 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
2629 case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
2630 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
2632 case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
2633 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
2635 case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
2636 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
2638 case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
2639 return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
2641 case AArch64::ZERO_M_PSEUDO:
2642 return EmitZero(MI, BB);
2643 case AArch64::ADDHA_MPPZ_PSEUDO_S:
2644 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_S, AArch64::ZAS0, MI, BB);
2645 case AArch64::ADDVA_MPPZ_PSEUDO_S:
2646 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_S, AArch64::ZAS0, MI, BB);
2647 case AArch64::ADDHA_MPPZ_PSEUDO_D:
2648 return EmitAddVectorToTile(AArch64::ADDHA_MPPZ_D, AArch64::ZAD0, MI, BB);
2649 case AArch64::ADDVA_MPPZ_PSEUDO_D:
2650 return EmitAddVectorToTile(AArch64::ADDVA_MPPZ_D, AArch64::ZAD0, MI, BB);
2654 //===----------------------------------------------------------------------===//
2655 // AArch64 Lowering private implementation.
2656 //===----------------------------------------------------------------------===//
2658 //===----------------------------------------------------------------------===//
2660 //===----------------------------------------------------------------------===//
2662 // Forward declarations of SVE fixed length lowering helpers
2663 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
2664 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2665 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
2666 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
2668 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
2671 /// isZerosVector - Check whether SDNode N is a zero-filled vector.
2672 static bool isZerosVector(const SDNode *N) {
2673 // Look through a bit convert.
2674 while (N->getOpcode() == ISD::BITCAST)
2675 N = N->getOperand(0).getNode();
2677 if (ISD::isConstantSplatVectorAllZeros(N))
2680 if (N->getOpcode() != AArch64ISD::DUP)
2683 auto Opnd0 = N->getOperand(0);
2684 auto *CINT = dyn_cast<ConstantSDNode>(Opnd0);
2685 auto *CFP = dyn_cast<ConstantFPSDNode>(Opnd0);
2686 return (CINT && CINT->isZero()) || (CFP && CFP->isZero());
2689 /// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64 CC.
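/// Equality maps to EQ/NE, signed compares to GT/GE/LT/LE, and unsigned
/// compares to HI/HS/LO/LS.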
2691 static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
2694 llvm_unreachable("Unknown condition code!");
2696 return AArch64CC::NE;
2698 return AArch64CC::EQ;
2700 return AArch64CC::GT;
2702 return AArch64CC::GE;
2704 return AArch64CC::LT;
2706 return AArch64CC::LE;
2708 return AArch64CC::HI;
2710 return AArch64CC::HS;
2712 return AArch64CC::LO;
2714 return AArch64CC::LS;
2718 /// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
2719 static void changeFPCCToAArch64CC(ISD::CondCode CC,
2720 AArch64CC::CondCode &CondCode,
2721 AArch64CC::CondCode &CondCode2) {
2722 CondCode2 = AArch64CC::AL;
2725 llvm_unreachable("Unknown FP condition!");
2728 CondCode = AArch64CC::EQ;
2732 CondCode = AArch64CC::GT;
2736 CondCode = AArch64CC::GE;
2739 CondCode = AArch64CC::MI;
2742 CondCode = AArch64CC::LS;
2745 CondCode = AArch64CC::MI;
2746 CondCode2 = AArch64CC::GT;
2749 CondCode = AArch64CC::VC;
2752 CondCode = AArch64CC::VS;
2755 CondCode = AArch64CC::EQ;
2756 CondCode2 = AArch64CC::VS;
2759 CondCode = AArch64CC::HI;
2762 CondCode = AArch64CC::PL;
2766 CondCode = AArch64CC::LT;
2770 CondCode = AArch64CC::LE;
2774 CondCode = AArch64CC::NE;
2779 /// Convert a DAG fp condition code to an AArch64 CC.
2780 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
2781 /// should be AND'ed instead of OR'ed.
2782 static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
2783 AArch64CC::CondCode &CondCode,
2784 AArch64CC::CondCode &CondCode2) {
2785 CondCode2 = AArch64CC::AL;
2788 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2789 assert(CondCode2 == AArch64CC::AL);
2793 // == ((a olt b) || (a ogt b))
2794 // == ((a ord b) && (a une b))
2795 CondCode = AArch64CC::VC;
2796 CondCode2 = AArch64CC::NE;
2800 // == ((a uno b) || (a oeq b))
2801 // == ((a ule b) && (a uge b))
2802 CondCode = AArch64CC::PL;
2803 CondCode2 = AArch64CC::LE;
2808 /// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
2809 /// CC usable with the vector instructions. Fewer operations are available
2810 /// without a real NZCV register, so we have to use less efficient combinations
2811 /// to get the same effect.
2812 static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
2813 AArch64CC::CondCode &CondCode,
2814 AArch64CC::CondCode &CondCode2,
2819 // Mostly the scalar mappings work fine.
2820 changeFPCCToAArch64CC(CC, CondCode, CondCode2);
2826 CondCode = AArch64CC::MI;
2827 CondCode2 = AArch64CC::GE;
2834     // All of the compare-mask comparisons are ordered, but we can switch between
2835     // ordered and unordered via a double inversion. E.g. ULE == !OGT.
2837 changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
2838 CondCode, CondCode2);
2843 static bool isLegalArithImmed(uint64_t C) {
2844 // Matches AArch64DAGToDAGISel::SelectArithImmed().
2845 bool IsLegal = (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
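  // e.g. 0xFFF and 0xFFF000 are legal (a 12-bit value, optionally shifted left
  // by 12), while 0x1001 is not.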
2846 LLVM_DEBUG(dbgs() << "Is imm " << C
2847 << " legal: " << (IsLegal ? "yes\n" : "no\n"));
2851 // Can a (CMP op1, (sub 0, op2)) be turned into a CMN instruction on
2852 // the grounds that "op1 - (-op2) == op1 + op2"? Not always: the C and V flags
2853 // can be set differently by this operation. It comes down to whether
2854 // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
2855 // everything is fine. If not then the optimization is wrong. Thus general
2856 // comparisons are only valid if op2 != 0.
2858 // So, finally, the only LLVM-native comparisons that don't mention C and V
2859 // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
2860 // the absence of information about op2.
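// For example, with an EQ/NE condition, (CMP x, (sub 0, y)) can be emitted as
// CMN x, y.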
2861 static bool isCMN(SDValue Op, ISD::CondCode CC) {
2862 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
2863 (CC == ISD::SETEQ || CC == ISD::SETNE);
2866 static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
2867 SelectionDAG &DAG, SDValue Chain,
2869 EVT VT = LHS.getValueType();
2870 assert(VT != MVT::f128);
2872 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2874 if (VT == MVT::f16 && !FullFP16) {
2875 LHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2877 RHS = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
2878 {LHS.getValue(1), RHS});
2879 Chain = RHS.getValue(1);
2883 IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
2884 return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
2887 static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
2888 const SDLoc &dl, SelectionDAG &DAG) {
2889 EVT VT = LHS.getValueType();
2890 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
2892 if (VT.isFloatingPoint()) {
2893 assert(VT != MVT::f128);
2894 if (VT == MVT::f16 && !FullFP16) {
2895 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
2896 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
2899 return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
2902 // The CMP instruction is just an alias for SUBS, and representing it as
2903 // SUBS means that it's possible to get CSE with subtract operations.
2904 // A later phase can perform the optimization of setting the destination
2905 // register to WZR/XZR if it ends up being unused.
2906 unsigned Opcode = AArch64ISD::SUBS;
2908 if (isCMN(RHS, CC)) {
2909     // Can we combine a (CMP op1, (sub 0, op2)) into a CMN instruction?
2910 Opcode = AArch64ISD::ADDS;
2911 RHS = RHS.getOperand(1);
2912 } else if (isCMN(LHS, CC)) {
2913     // As we are looking for EQ/NE compares, the operands can be commuted; can
2914     // we combine a (CMP (sub 0, op1), op2) into a CMN instruction?
2915 Opcode = AArch64ISD::ADDS;
2916 LHS = LHS.getOperand(1);
2917 } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
2918 if (LHS.getOpcode() == ISD::AND) {
2919 // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
2920 // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
2921 // of the signed comparisons.
2922 const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
2923 DAG.getVTList(VT, MVT_CC),
2926 // Replace all users of (and X, Y) with newly generated (ands X, Y)
2927 DAG.ReplaceAllUsesWith(LHS, ANDSNode);
2928 return ANDSNode.getValue(1);
2929 } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
2930 // Use result of ANDS
2931 return LHS.getValue(1);
2935 return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
2936 .getValue(1);
2937 }
2939 /// \defgroup AArch64CCMP CMP;CCMP matching
2941 /// These functions deal with the formation of CMP;CCMP;... sequences.
2942 /// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
2943 /// a comparison. They set the NZCV flags to a predefined value if their
2944 /// predicate is false. This allows us to express arbitrary conjunctions, for
2945 /// example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
2948 /// ccmp B, inv(CB), CA
2949 /// check for CB flags
2951 /// This naturally lets us implement chains of AND operations with SETCC
2952 /// operands. And we can even implement some other situations by transforming
2953 /// them:
2954 /// - We can implement (NEG SETCC) i.e. negating a single comparison by
2955 /// negating the flags used in a CCMP/FCCMP operation.
2956 /// - We can negate the result of a whole chain of CMP/CCMP/FCCMP operations
2957 /// by negating the flags we test for afterwards. i.e.
2958 /// NEG (CMP CCMP CCMP ...) can be implemented.
2959 /// - Note that we can only ever negate all previously processed results.
2960 /// What we cannot implement by flipping the flags to test is a negation
2961 /// of two sub-trees (because the negation affects all sub-trees emitted so
2962 /// far, so the 2nd sub-tree we emit would also affect the first).
2963 /// With those tools we can implement some OR operations:
2964 /// - (OR (SETCC A) (SETCC B)) can be implemented via:
2965 /// NEG (AND (NEG (SETCC A)) (NEG (SETCC B)))
2966 /// - After transforming OR to NEG/AND combinations we may be able to use NEG
2967 /// elimination rules from earlier to implement the whole thing as a
2968 /// CCMP/FCCMP chain.
2970 /// As complete example:
2971 /// or (or (setCA (cmp A)) (setCB (cmp B)))
2972 /// (and (setCC (cmp C)) (setCD (cmp D)))
2973 /// can be reassociated to:
2974 /// or (and (setCC (cmp C)) (setCD (cmp D)))
2975 /// (or (setCA (cmp A)) (setCB (cmp B)))
2976 /// can be transformed to:
2977 /// not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
2978 /// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))
2979 /// which can be implemented as:
2980 /// cmp C
2981 /// ccmp D, inv(CD), CC
2982 /// ccmp A, CA, inv(CD)
2983 /// ccmp B, CB, inv(CA)
2984 /// check for CB flags
2986 /// A counterexample is "or (and A B) (and C D)" which translates to
2987 /// not (and (not (and (not A) (not B))) (not (and (not C) (not D)))), we
2988 /// can only implement 1 of the inner (not) operations, but not both!
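///
/// As an illustrative sketch (not part of the original notes), a source-level
/// conjunction such as "a == 0 && b > 5" is expected to become roughly:
///   cmp  w0, #0
///   ccmp w1, #5, #4, eq   ; if eq: compare b with 5, else NZCV := 0b0100
///   b.gt <taken>
/// where the immediate #4 sets Z so that the final GT test fails when a != 0.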
2991 /// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
2992 static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
2993 ISD::CondCode CC, SDValue CCOp,
2994 AArch64CC::CondCode Predicate,
2995 AArch64CC::CondCode OutCC,
2996 const SDLoc &DL, SelectionDAG &DAG) {
2997 unsigned Opcode = 0;
2998 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
3000 if (LHS.getValueType().isFloatingPoint()) {
3001 assert(LHS.getValueType() != MVT::f128);
3002 if (LHS.getValueType() == MVT::f16 && !FullFP16) {
3003 LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
3004 RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
3006 Opcode = AArch64ISD::FCCMP;
3007 } else if (RHS.getOpcode() == ISD::SUB) {
3008 SDValue SubOp0 = RHS.getOperand(0);
3009 if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
3010 // See emitComparison() on why we can only do this for SETEQ and SETNE.
3011 Opcode = AArch64ISD::CCMN;
3012 RHS = RHS.getOperand(1);
3016 Opcode = AArch64ISD::CCMP;
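// The NZCV immediate below encodes the flag state the conditional compare
// installs when its predicate is false; it is chosen so that the subsequent
// test of OutCC fails in that case.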
3018 SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
3019 AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
3020 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
3021 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
3022 return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
3025 /// Returns true if @p Val is a tree of AND/OR/SETCC operations that can be
3026 /// expressed as a conjunction. See \ref AArch64CCMP.
3027 /// \param CanNegate Set to true if we can negate the whole sub-tree just by
3028 /// changing the conditions on the SETCC tests.
3029 /// (this means we can call emitConjunctionRec() with
3030 /// Negate==true on this sub-tree)
3031 /// \param MustBeFirst Set to true if this subtree needs to be negated and we
3032 /// cannot do the negation naturally. We are required to
3033 /// emit the subtree first in this case.
3034 /// \param WillNegate Is true if we are called when the result of this
3035 /// subexpression must be negated. This happens when the
3036 /// outer expression is an OR. We can use this fact to know
3037 /// that we have a double negation (or (or ...) ...) that
3038 /// can be implemented for free.
3039 static bool canEmitConjunction(const SDValue Val, bool &CanNegate,
3040 bool &MustBeFirst, bool WillNegate,
3041 unsigned Depth = 0) {
3042 if (!Val.hasOneUse())
3044 unsigned Opcode = Val->getOpcode();
3045 if (Opcode == ISD::SETCC) {
3046 if (Val->getOperand(0).getValueType() == MVT::f128)
3049 MustBeFirst = false;
3052 // Protect against exponential runtime and stack overflow.
3055 if (Opcode == ISD::AND || Opcode == ISD::OR) {
3056 bool IsOR = Opcode == ISD::OR;
3057 SDValue O0 = Val->getOperand(0);
3058 SDValue O1 = Val->getOperand(1);
3061 if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, Depth+1))
3065 if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, Depth+1))
3068 if (MustBeFirstL && MustBeFirstR)
3072 // For an OR expression we need to be able to naturally negate at least
3073 // one side or we cannot do the transformation at all.
3074 if (!CanNegateL && !CanNegateR)
3076 // If the result of the OR will be negated and we can naturally negate
3077 // the leaves, then this sub-tree as a whole negates naturally.
3078 CanNegate = WillNegate && CanNegateL && CanNegateR;
3079 // If we cannot naturally negate the whole sub-tree, then this must be
3080 // emitted first.
3081 MustBeFirst = !CanNegate;
3083 assert(Opcode == ISD::AND && "Must be OR or AND");
3084 // We cannot naturally negate an AND operation.
3086 MustBeFirst = MustBeFirstL || MustBeFirstR;
3093 /// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
3094 /// of CCMP/FCCMP ops. See @ref AArch64CCMP.
3095 /// Tries to transform the given i1 producing node @p Val to a series of compare
3096 /// and conditional compare operations. @returns an NZCV flags producing node
3097 /// and sets @p OutCC to the flags that should be tested, or returns SDValue() if
3098 /// the transformation was not possible.
3099 /// \p Negate is true if we want this sub-tree to be negated just by changing
3100 /// SETCC conditions.
3101 static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
3102 AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
3103 AArch64CC::CondCode Predicate) {
3104 // We're at a tree leaf, produce a conditional comparison operation.
3105 unsigned Opcode = Val->getOpcode();
3106 if (Opcode == ISD::SETCC) {
3107 SDValue LHS = Val->getOperand(0);
3108 SDValue RHS = Val->getOperand(1);
3109 ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
3110 bool isInteger = LHS.getValueType().isInteger();
3112 CC = getSetCCInverse(CC, LHS.getValueType());
3114 // Determine OutCC and handle FP special case.
3116 OutCC = changeIntCCToAArch64CC(CC);
3118 assert(LHS.getValueType().isFloatingPoint());
3119 AArch64CC::CondCode ExtraCC;
3120 changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
3121 // Some floating point conditions can't be tested with a single condition
3122 // code. Construct an additional comparison in this case.
3123 if (ExtraCC != AArch64CC::AL) {
3125 if (!CCOp.getNode())
3126 ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
3128 ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
3131 Predicate = ExtraCC;
3135 // Produce a normal comparison if we are first in the chain
3137 return emitComparison(LHS, RHS, CC, DL, DAG);
3138 // Otherwise produce a ccmp.
3139 return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
3142 assert(Val->hasOneUse() && "Valid conjunction/disjunction tree");
3144 bool IsOR = Opcode == ISD::OR;
3146 SDValue LHS = Val->getOperand(0);
3149 bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR);
3150 assert(ValidL && "Valid conjunction/disjunction tree");
3153 SDValue RHS = Val->getOperand(1);
3156 bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR);
3157 assert(ValidR && "Valid conjunction/disjunction tree");
3160 // Swap sub-tree that must come first to the right side.
3162 assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
3163 std::swap(LHS, RHS);
3164 std::swap(CanNegateL, CanNegateR);
3165 std::swap(MustBeFirstL, MustBeFirstR);
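// The flags below track where negations happen: NegateR/NegateL flip the
// SETCC conditions inside a sub-tree, NegateAfterR flips the condition tested
// after the right sub-tree has been emitted, and NegateAfterAll flips the
// final result (this is how an OR is built from an AND of negations).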
3171 bool NegateAfterAll;
3172 if (Opcode == ISD::OR) {
3173 // Swap the sub-tree that we can negate naturally to the left.
3175 assert(CanNegateR && "at least one side must be negatable");
3176 assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
3178 std::swap(LHS, RHS);
3180 NegateAfterR = true;
3182 // Negate the right sub-tree if possible, otherwise negate its result afterwards.
3183 NegateR = CanNegateR;
3184 NegateAfterR = !CanNegateR;
3187 NegateAfterAll = !Negate;
3189 assert(Opcode == ISD::AND && "Valid conjunction/disjunction tree");
3190 assert(!Negate && "Valid conjunction/disjunction tree");
3194 NegateAfterR = false;
3195 NegateAfterAll = false;
3199 AArch64CC::CondCode RHSCC;
3200 SDValue CmpR = emitConjunctionRec(DAG, RHS, RHSCC, NegateR, CCOp, Predicate);
3202 RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
3203 SDValue CmpL = emitConjunctionRec(DAG, LHS, OutCC, NegateL, CmpR, RHSCC);
3205 OutCC = AArch64CC::getInvertedCondCode(OutCC);
3209 /// Emit expression as a conjunction (a series of CCMP/FCCMP ops).
3210 /// In some cases this is even possible with OR operations in the expression.
3211 /// See \ref AArch64CCMP.
3212 /// \see emitConjunctionRec().
3213 static SDValue emitConjunction(SelectionDAG &DAG, SDValue Val,
3214 AArch64CC::CondCode &OutCC) {
3215 bool DummyCanNegate;
3216 bool DummyMustBeFirst;
3217 if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false))
3220 return emitConjunctionRec(DAG, Val, OutCC, false, SDValue(), AArch64CC::AL);
3225 /// Returns how profitable it is to fold a comparison's operand's shift and/or
3226 /// extension operations.
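/// The result is a small score: 0 means nothing useful can be folded, larger
/// values mean a supported extend and/or shift could be folded into the
/// compare's register operand for free.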
3227 static unsigned getCmpOperandFoldingProfit(SDValue Op) {
3228 auto isSupportedExtend = [&](SDValue V) {
3229 if (V.getOpcode() == ISD::SIGN_EXTEND_INREG)
3232 if (V.getOpcode() == ISD::AND)
3233 if (ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(V.getOperand(1))) {
3234 uint64_t Mask = MaskCst->getZExtValue();
3235 return (Mask == 0xFF || Mask == 0xFFFF || Mask == 0xFFFFFFFF);
3241 if (!Op.hasOneUse())
3244 if (isSupportedExtend(Op))
3247 unsigned Opc = Op.getOpcode();
3248 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
3249 if (ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
3250 uint64_t Shift = ShiftCst->getZExtValue();
3251 if (isSupportedExtend(Op.getOperand(0)))
3252 return (Shift <= 4) ? 2 : 1;
3253 EVT VT = Op.getValueType();
3254 if ((VT == MVT::i32 && Shift <= 31) || (VT == MVT::i64 && Shift <= 63))
3261 static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
3262 SDValue &AArch64cc, SelectionDAG &DAG,
3264 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
3265 EVT VT = RHS.getValueType();
3266 uint64_t C = RHSC->getZExtValue();
3267 if (!isLegalArithImmed(C)) {
3268 // Constant does not fit, try adjusting it by one?
3274 if ((VT == MVT::i32 && C != 0x80000000 &&
3275 isLegalArithImmed((uint32_t)(C - 1))) ||
3276 (VT == MVT::i64 && C != 0x8000000000000000ULL &&
3277 isLegalArithImmed(C - 1ULL))) {
3278 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
3279 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3280 RHS = DAG.getConstant(C, dl, VT);
3285 if ((VT == MVT::i32 && C != 0 &&
3286 isLegalArithImmed((uint32_t)(C - 1))) ||
3287 (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
3288 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
3289 C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
3290 RHS = DAG.getConstant(C, dl, VT);
3295 if ((VT == MVT::i32 && C != INT32_MAX &&
3296 isLegalArithImmed((uint32_t)(C + 1))) ||
3297 (VT == MVT::i64 && C != INT64_MAX &&
3298 isLegalArithImmed(C + 1ULL))) {
3299 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
3300 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3301 RHS = DAG.getConstant(C, dl, VT);
3306 if ((VT == MVT::i32 && C != UINT32_MAX &&
3307 isLegalArithImmed((uint32_t)(C + 1))) ||
3308 (VT == MVT::i64 && C != UINT64_MAX &&
3309 isLegalArithImmed(C + 1ULL))) {
3310 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
3311 C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
3312 RHS = DAG.getConstant(C, dl, VT);
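// Illustrative example of the adjustments above: "x s< 4097" has no legal
// immediate, but the equivalent "x s<= 4096" does (4096 is a 12-bit value
// shifted left by 12), so the condition and constant are rewritten.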
3319 // Comparisons are canonicalized so that the RHS operand is simpler than the
3320 // LHS one, the extreme case being when RHS is an immediate. However, AArch64
3321 // can fold some shift+extend operations on the RHS operand, so swap the
3322 // operands if that can be done.
3327 // For example, "cmp w13, w12" where w13 = (w11 << 1) can be turned into:
3328 // cmp w12, w11, lsl #1
3329 if (!isa<ConstantSDNode>(RHS) ||
3330 !isLegalArithImmed(cast<ConstantSDNode>(RHS)->getZExtValue())) {
3331 SDValue TheLHS = isCMN(LHS, CC) ? LHS.getOperand(1) : LHS;
3333 if (getCmpOperandFoldingProfit(TheLHS) > getCmpOperandFoldingProfit(RHS)) {
3334 std::swap(LHS, RHS);
3335 CC = ISD::getSetCCSwappedOperands(CC);
3340 AArch64CC::CondCode AArch64CC;
3341 if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
3342 const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);
3344 // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
3345 // For the i8 operand, the largest immediate is 255, so this can be easily
3346 // encoded in the compare instruction. For the i16 operand, however, the
3347 // largest immediate cannot be encoded in the compare.
3348 // Therefore, use a sign extending load and cmn to avoid materializing the
3349 // -1 constant. For example,
3350 // movz w1, #65535
3351 // ldrh w0, [x0, #0]
3352 // cmp w0, w1
3353 // becomes
3354 // ldrsh w0, [x0, #0]
3355 // cmn w0, #1
3356 // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
3357 // if and only if (sext LHS) == (sext RHS). The checks are in place to
3358 // ensure both the LHS and RHS are truly zero extended and to make sure the
3359 // transformation is profitable.
3360 if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
3361 cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
3362 cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
3363 LHS.getNode()->hasNUsesOfValue(1, 0)) {
3364 int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
3365 if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
3367 DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
3368 DAG.getValueType(MVT::i16));
3369 Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
3370 RHS.getValueType()),
3372 AArch64CC = changeIntCCToAArch64CC(CC);
3376 if (!Cmp && (RHSC->isZero() || RHSC->isOne())) {
3377 if ((Cmp = emitConjunction(DAG, LHS, AArch64CC))) {
3378 if ((CC == ISD::SETNE) ^ RHSC->isZero())
3379 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
3385 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
3386 AArch64CC = changeIntCCToAArch64CC(CC);
3388 AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
3392 static std::pair<SDValue, SDValue>
3393 getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
3394 assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
3395 "Unsupported value type");
3396 SDValue Value, Overflow;
3398 SDValue LHS = Op.getOperand(0);
3399 SDValue RHS = Op.getOperand(1);
3401 switch (Op.getOpcode()) {
3403 llvm_unreachable("Unknown overflow instruction!");
3405 Opc = AArch64ISD::ADDS;
3409 Opc = AArch64ISD::ADDS;
3413 Opc = AArch64ISD::SUBS;
3417 Opc = AArch64ISD::SUBS;
3420 // Multiply needs a little bit of extra work.
3424 bool IsSigned = Op.getOpcode() == ISD::SMULO;
3425 if (Op.getValueType() == MVT::i32) {
3426 // Extend to 64-bits, then perform a 64-bit multiply.
3427 unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3428 LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
3429 RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
3430 SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3431 Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mul);
3433 // Check that the result fits into a 32-bit integer.
3434 SDVTList VTs = DAG.getVTList(MVT::i64, MVT_CC);
3436 // cmp xreg, wreg, sxtw
3437 SDValue SExtMul = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Value);
3439 DAG.getNode(AArch64ISD::SUBS, DL, VTs, Mul, SExtMul).getValue(1);
3441 // tst xreg, #0xffffffff00000000
3442 SDValue UpperBits = DAG.getConstant(0xFFFFFFFF00000000, DL, MVT::i64);
3444 DAG.getNode(AArch64ISD::ANDS, DL, VTs, Mul, UpperBits).getValue(1);
3448 assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
3449 // For the 64 bit multiply
3450 Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
3452 SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
3453 SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
3454 DAG.getConstant(63, DL, MVT::i64));
3455 // It is important that LowerBits is last, otherwise the arithmetic
3456 // shift will not be folded into the compare (SUBS).
3457 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3458 Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
3461 SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
3462 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
3464 DAG.getNode(AArch64ISD::SUBS, DL, VTs,
3465 DAG.getConstant(0, DL, MVT::i64),
3466 UpperBits).getValue(1);
3473 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
3475 // Emit the AArch64 operation with overflow check.
3476 Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
3477 Overflow = Value.getValue(1);
3479 return std::make_pair(Value, Overflow);
3482 SDValue AArch64TargetLowering::LowerXOR(SDValue Op, SelectionDAG &DAG) const {
3483 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
3484 return LowerToScalableOp(Op, DAG);
3486 SDValue Sel = Op.getOperand(0);
3487 SDValue Other = Op.getOperand(1);
3490 // If the operand is an overflow checking operation, invert the condition
3491 // code and kill the Not operation. I.e., transform:
3492 // (xor overflow_op_bool, 1)
3493 // into
3494 // (csel 1, 0, invert(cc), overflow_op_bool)
3495 // ... which later gets transformed to just a cset instruction with an
3496 // inverted condition code, rather than a cset + eor sequence.
3497 if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
3498 // Only lower legal XALUO ops.
3499 if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
3502 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3503 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3504 AArch64CC::CondCode CC;
3505 SDValue Value, Overflow;
3506 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Sel.getValue(0), DAG);
3507 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3508 return DAG.getNode(AArch64ISD::CSEL, dl, Op.getValueType(), TVal, FVal,
3511 // If neither operand is a SELECT_CC, give up.
3512 if (Sel.getOpcode() != ISD::SELECT_CC)
3513 std::swap(Sel, Other);
3514 if (Sel.getOpcode() != ISD::SELECT_CC)
3517 // The folding we want to perform is:
3518 // (xor x, (select_cc a, b, cc, 0, -1))
3519 // -->
3520 // (csel x, (xor x, -1), cc ...)
3522 // The latter will get matched to a CSINV instruction.
3524 ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
3525 SDValue LHS = Sel.getOperand(0);
3526 SDValue RHS = Sel.getOperand(1);
3527 SDValue TVal = Sel.getOperand(2);
3528 SDValue FVal = Sel.getOperand(3);
3530 // FIXME: This could be generalized to non-integer comparisons.
3531 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
3534 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
3535 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
3537 // The values aren't constants, this isn't the pattern we're looking for.
3538 if (!CFVal || !CTVal)
3541 // We can commute the SELECT_CC by inverting the condition. This
3542 // might be needed to make this fit into a CSINV pattern.
3543 if (CTVal->isAllOnes() && CFVal->isZero()) {
3544 std::swap(TVal, FVal);
3545 std::swap(CTVal, CFVal);
3546 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
3549 // If the constants line up, perform the transform!
3550 if (CTVal->isZero() && CFVal->isAllOnes()) {
3552 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
3555 TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
3556 DAG.getConstant(-1ULL, dl, Other.getValueType()));
3558 return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
3565 // If Invert is false, sets 'C' bit of NZCV to 0 if value is 0, else sets 'C'
3566 // bit to 1. If Invert is true, sets 'C' bit of NZCV to 1 if value is 0, else
3567 // sets 'C' bit to 0.
3568 static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert) {
3570 EVT VT = Value.getValueType();
3571 SDValue Op0 = Invert ? DAG.getConstant(0, DL, VT) : Value;
3572 SDValue Op1 = Invert ? Value : DAG.getConstant(1, DL, VT);
3574 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::Glue), Op0, Op1);
3575 return Cmp.getValue(1);
3578 // If Invert is false, value is 1 if 'C' bit of NZCV is 1, else 0.
3579 // If Invert is true, value is 0 if 'C' bit of NZCV is 1, else 1.
3580 static SDValue carryFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG,
3582 assert(Flag.getResNo() == 1);
3584 SDValue Zero = DAG.getConstant(0, DL, VT);
3585 SDValue One = DAG.getConstant(1, DL, VT);
3586 unsigned Cond = Invert ? AArch64CC::LO : AArch64CC::HS;
3587 SDValue CC = DAG.getConstant(Cond, DL, MVT::i32);
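// The CSEL of 1/0 on HS (or LO when inverted) is typically selected as a
// single CSINC/cset instruction.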
3588 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3591 // Value is 1 if 'V' bit of NZCV is 1, else 0
3592 static SDValue overflowFlagToValue(SDValue Flag, EVT VT, SelectionDAG &DAG) {
3593 assert(Flag.getResNo() == 1);
3595 SDValue Zero = DAG.getConstant(0, DL, VT);
3596 SDValue One = DAG.getConstant(1, DL, VT);
3597 SDValue CC = DAG.getConstant(AArch64CC::VS, DL, MVT::i32);
3598 return DAG.getNode(AArch64ISD::CSEL, DL, VT, One, Zero, CC, Flag);
3601 // This lowering is inefficient, but it will get cleaned up by
3602 // `foldOverflowCheck`
3603 static SDValue lowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode,
3605 EVT VT0 = Op.getValue(0).getValueType();
3606 EVT VT1 = Op.getValue(1).getValueType();
3608 if (VT0 != MVT::i32 && VT0 != MVT::i64)
3611 bool InvertCarry = Opcode == AArch64ISD::SBCS;
3612 SDValue OpLHS = Op.getOperand(0);
3613 SDValue OpRHS = Op.getOperand(1);
3614 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
3617 SDVTList VTs = DAG.getVTList(VT0, VT1);
3619 SDValue Sum = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::Glue), OpLHS,
3623 IsSigned ? overflowFlagToValue(Sum.getValue(1), VT1, DAG)
3624 : carryFlagToValue(Sum.getValue(1), VT1, DAG, InvertCarry);
3626 return DAG.getNode(ISD::MERGE_VALUES, DL, VTs, Sum, OutFlag);
3629 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
3630 // Let legalize expand this if it isn't a legal type yet.
3631 if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
3635 AArch64CC::CondCode CC;
3636 // The actual operation that sets the overflow or carry flag.
3637 SDValue Value, Overflow;
3638 std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
3640 // We use 0 and 1 as false and true values.
3641 SDValue TVal = DAG.getConstant(1, dl, MVT::i32);
3642 SDValue FVal = DAG.getConstant(0, dl, MVT::i32);
3644 // We use an inverted condition, because the conditional select is inverted
3645 // too. This will allow it to be selected to a single instruction:
3646 // CSINC Wd, WZR, WZR, invert(cond).
3647 SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
3648 Overflow = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, FVal, TVal,
3651 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
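// Illustrative end result (a sketch, not from the source) for an i32 saddo:
//   adds w8, w0, w1   ; Value, sets NZCV
//   cset w9, vs       ; Overflow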
3652 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
3655 // Prefetch operands are:
3656 // 1: Address to prefetch
3657 // 2: bool isWrite
3658 // 3: int locality (0 = no locality ... 3 = extreme locality)
3659 // 4: bool isDataCache
3660 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
3662 unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
3663 unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
3664 unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
3666 bool IsStream = !Locality;
3667 // When the locality number is set
3668 if (Locality) {
3669 // The front-end should have filtered out the out-of-range values
3670 assert(Locality <= 3 && "Prefetch locality out-of-range");
3671 // The locality degree is the opposite of the cache speed.
3672 // Put the number the other way around.
3673 // The encoding starts at 0 for level 1
3674 Locality = 3 - Locality;
3675 }
3677 // Build the mask value encoding the expected behavior.
3678 unsigned PrfOp = (IsWrite << 4) | // Load/Store bit
3679 (!IsData << 3) | // IsDataCache bit
3680 (Locality << 1) | // Cache level bits
3681 (unsigned)IsStream; // Stream bit
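// Illustrative encoding: a read prefetch of data with locality 3
// (IsWrite = 0, IsData = 1, adjusted Locality = 0, IsStream = 0) produces
// PrfOp == 0b00000, i.e. PLDL1KEEP.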
3682 return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
3683 DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
3686 SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
3687 SelectionDAG &DAG) const {
3688 EVT VT = Op.getValueType();
3689 if (VT.isScalableVector())
3690 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_EXTEND_MERGE_PASSTHRU);
3692 if (useSVEForFixedLengthVectorVT(VT))
3693 return LowerFixedLengthFPExtendToSVE(Op, DAG);
3695 assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
3699 SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
3700 SelectionDAG &DAG) const {
3701 if (Op.getValueType().isScalableVector())
3702 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
3704 bool IsStrict = Op->isStrictFPOpcode();
3705 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3706 EVT SrcVT = SrcVal.getValueType();
3708 if (useSVEForFixedLengthVectorVT(SrcVT))
3709 return LowerFixedLengthFPRoundToSVE(Op, DAG);
3711 if (SrcVT != MVT::f128) {
3712 // Expand cases where the input is a vector bigger than NEON.
3713 if (useSVEForFixedLengthVectorVT(SrcVT))
3716 // It's legal except when f128 is involved
3723 SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
3724 SelectionDAG &DAG) const {
3725 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3726 // Any additional optimization in this function should be recorded
3727 // in the cost tables.
3728 bool IsStrict = Op->isStrictFPOpcode();
3729 EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
3730 EVT VT = Op.getValueType();
3732 if (VT.isScalableVector()) {
3733 unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
3734 ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
3735 : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
3736 return LowerToPredicatedOp(Op, DAG, Opcode);
3739 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3740 return LowerFixedLengthFPToIntToSVE(Op, DAG);
3742 unsigned NumElts = InVT.getVectorNumElements();
3744 // f16 conversions are promoted to f32 when full fp16 is not supported.
3745 if (InVT.getVectorElementType() == MVT::f16 &&
3746 !Subtarget->hasFullFP16()) {
3747 MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
3750 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
3751 {Op.getOperand(0), Op.getOperand(1)});
3752 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3753 {Ext.getValue(1), Ext.getValue(0)});
3756 Op.getOpcode(), dl, Op.getValueType(),
3757 DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
3760 uint64_t VTSize = VT.getFixedSizeInBits();
3761 uint64_t InVTSize = InVT.getFixedSizeInBits();
3762 if (VTSize < InVTSize) {
3765 InVT = InVT.changeVectorElementTypeToInteger();
3766 SDValue Cv = DAG.getNode(Op.getOpcode(), dl, {InVT, MVT::Other},
3767 {Op.getOperand(0), Op.getOperand(1)});
3768 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3769 return DAG.getMergeValues({Trunc, Cv.getValue(1)}, dl);
3772 DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
3774 return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
3777 if (VTSize > InVTSize) {
3780 MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()),
3781 VT.getVectorNumElements());
3783 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {ExtVT, MVT::Other},
3784 {Op.getOperand(0), Op.getOperand(1)});
3785 return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
3786 {Ext.getValue(1), Ext.getValue(0)});
3788 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0));
3789 return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
3792 // Use a scalar operation for conversions between single-element vectors of
3796 SDValue Extract = DAG.getNode(
3797 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
3798 Op.getOperand(IsStrict ? 1 : 0), DAG.getConstant(0, dl, MVT::i64));
3799 EVT ScalarVT = VT.getScalarType();
3801 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
3802 {Op.getOperand(0), Extract});
3803 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
3806 // Type changing conversions are illegal.
3810 SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
3811 SelectionDAG &DAG) const {
3812 bool IsStrict = Op->isStrictFPOpcode();
3813 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
3815 if (SrcVal.getValueType().isVector())
3816 return LowerVectorFP_TO_INT(Op, DAG);
3818 // f16 conversions are promoted to f32 when full fp16 is not supported.
3819 if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
3823 DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::f32, MVT::Other},
3824 {Op.getOperand(0), SrcVal});
3825 return DAG.getNode(Op.getOpcode(), dl, {Op.getValueType(), MVT::Other},
3826 {Ext.getValue(1), Ext.getValue(0)});
3829 Op.getOpcode(), dl, Op.getValueType(),
3830 DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
3833 if (SrcVal.getValueType() != MVT::f128) {
3834 // It's legal except when f128 is involved
3842 AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
3843 SelectionDAG &DAG) const {
3844 // AArch64 FP-to-int conversions saturate to the destination element size, so
3845 // we can lower common saturating conversions to simple instructions.
3846 SDValue SrcVal = Op.getOperand(0);
3847 EVT SrcVT = SrcVal.getValueType();
3848 EVT DstVT = Op.getValueType();
3849 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3851 uint64_t SrcElementWidth = SrcVT.getScalarSizeInBits();
3852 uint64_t DstElementWidth = DstVT.getScalarSizeInBits();
3853 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3854 assert(SatWidth <= DstElementWidth &&
3855 "Saturation width cannot exceed result width");
3857 // TODO: Consider lowering to SVE operations, as in LowerVectorFP_TO_INT.
3858 // Currently, the `llvm.fpto[su]i.sat.*` intrinsics don't accept scalable
3859 // types, so this is hard to reach.
3860 if (DstVT.isScalableVector())
3863 EVT SrcElementVT = SrcVT.getVectorElementType();
3865 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3866 if (SrcElementVT == MVT::f16 &&
3867 (!Subtarget->hasFullFP16() || DstElementWidth > 16)) {
3868 MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
3869 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
3871 SrcElementVT = MVT::f32;
3872 SrcElementWidth = 32;
3873 } else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
3874 SrcElementVT != MVT::f16)
3878 // Cases that we can emit directly.
3879 if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
3880 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3881 DAG.getValueType(DstVT.getScalarType()));
3883 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3884 // result. This is only valid if the legal cvt is larger than the saturate
3885 // width. For double, as we don't have MIN/MAX, it can be simpler to scalarize
3886 // (at least until sqxtn is selected).
3887 if (SrcElementWidth < SatWidth || SrcElementVT == MVT::f64)
3890 EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
3891 SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
3892 DAG.getValueType(IntVT.getScalarType()));
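// For example, a v4f32 -> v4i16 fptosi.sat first converts to v4i32 (FCVTZS
// already saturates to the 32-bit range), then clamps to the 16-bit range
// with smin/smax below, and finally truncates.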
3894 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3895 SDValue MinC = DAG.getConstant(
3896 APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3897 SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
3898 SDValue MaxC = DAG.getConstant(
3899 APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
3900 Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
3902 SDValue MinC = DAG.getConstant(
3903 APInt::getAllOnesValue(SatWidth).zext(SrcElementWidth), DL, IntVT);
3904 Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
3907 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
3910 SDValue AArch64TargetLowering::LowerFP_TO_INT_SAT(SDValue Op,
3911 SelectionDAG &DAG) const {
3912 // AArch64 FP-to-int conversions saturate to the destination register size, so
3913 // we can lower common saturating conversions to simple instructions.
3914 SDValue SrcVal = Op.getOperand(0);
3915 EVT SrcVT = SrcVal.getValueType();
3917 if (SrcVT.isVector())
3918 return LowerVectorFP_TO_INT_SAT(Op, DAG);
3920 EVT DstVT = Op.getValueType();
3921 EVT SatVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3922 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3923 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3924 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3926 // In the absence of FP16 support, promote f16 to f32 and saturate the result.
3927 if (SrcVT == MVT::f16 && !Subtarget->hasFullFP16()) {
3928 SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, SrcVal);
3930 } else if (SrcVT != MVT::f64 && SrcVT != MVT::f32 && SrcVT != MVT::f16)
3934 // Cases that we can emit directly.
3935 if ((SrcVT == MVT::f64 || SrcVT == MVT::f32 ||
3936 (SrcVT == MVT::f16 && Subtarget->hasFullFP16())) &&
3937 DstVT == SatVT && (DstVT == MVT::i64 || DstVT == MVT::i32))
3938 return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
3939 DAG.getValueType(DstVT));
3941 // Otherwise we emit a cvt that saturates to a higher BW, and saturate the
3942 // result. This is only valid if the legal cvt is larger than the saturate
3944 if (DstWidth < SatWidth)
3948 DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal, DAG.getValueType(DstVT));
3950 if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
3951 SDValue MinC = DAG.getConstant(
3952 APInt::getSignedMaxValue(SatWidth).sext(DstWidth), DL, DstVT);
3953 SDValue Min = DAG.getNode(ISD::SMIN, DL, DstVT, NativeCvt, MinC);
3954 SDValue MaxC = DAG.getConstant(
3955 APInt::getSignedMinValue(SatWidth).sext(DstWidth), DL, DstVT);
3956 Sat = DAG.getNode(ISD::SMAX, DL, DstVT, Min, MaxC);
3958 SDValue MinC = DAG.getConstant(
3959 APInt::getAllOnesValue(SatWidth).zext(DstWidth), DL, DstVT);
3960 Sat = DAG.getNode(ISD::UMIN, DL, DstVT, NativeCvt, MinC);
3963 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
3966 SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
3967 SelectionDAG &DAG) const {
3968 // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
3969 // Any additional optimization in this function should be recorded
3970 // in the cost tables.
3971 bool IsStrict = Op->isStrictFPOpcode();
3972 EVT VT = Op.getValueType();
3974 SDValue In = Op.getOperand(IsStrict ? 1 : 0);
3975 EVT InVT = In.getValueType();
3976 unsigned Opc = Op.getOpcode();
3977 bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
3979 if (VT.isScalableVector()) {
3980 if (InVT.getVectorElementType() == MVT::i1) {
3981 // We can't directly extend an SVE predicate; extend it first.
3982 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3983 EVT CastVT = getPromotedVTForPredicate(InVT);
3984 In = DAG.getNode(CastOpc, dl, CastVT, In);
3985 return DAG.getNode(Opc, dl, VT, In);
3988 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
3989 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
3990 return LowerToPredicatedOp(Op, DAG, Opcode);
3993 if (useSVEForFixedLengthVectorVT(VT) || useSVEForFixedLengthVectorVT(InVT))
3994 return LowerFixedLengthIntToFPToSVE(Op, DAG);
3996 uint64_t VTSize = VT.getFixedSizeInBits();
3997 uint64_t InVTSize = InVT.getFixedSizeInBits();
3998 if (VTSize < InVTSize) {
4000 MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
4001 InVT.getVectorNumElements());
4003 In = DAG.getNode(Opc, dl, {CastVT, MVT::Other},
4004 {Op.getOperand(0), In});
4006 ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other},
4007 {In.getValue(1), In.getValue(0), DAG.getIntPtrConstant(0, dl)});
4009 In = DAG.getNode(Opc, dl, CastVT, In);
4010 return DAG.getNode(ISD::FP_ROUND, dl, VT, In,
4011 DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
4014 if (VTSize > InVTSize) {
4015 unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
4016 EVT CastVT = VT.changeVectorElementTypeToInteger();
4017 In = DAG.getNode(CastOpc, dl, CastVT, In);
4019 return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op.getOperand(0), In});
4020 return DAG.getNode(Opc, dl, VT, In);
4023 // Use a scalar operation for conversions between single-element vectors of
4025 if (VT.getVectorNumElements() == 1) {
4026 SDValue Extract = DAG.getNode(
4027 ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
4028 In, DAG.getConstant(0, dl, MVT::i64));
4029 EVT ScalarVT = VT.getScalarType();
4031 return DAG.getNode(Op.getOpcode(), dl, {ScalarVT, MVT::Other},
4032 {Op.getOperand(0), Extract});
4033 return DAG.getNode(Op.getOpcode(), dl, ScalarVT, Extract);
4039 SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
4040 SelectionDAG &DAG) const {
4041 if (Op.getValueType().isVector())
4042 return LowerVectorINT_TO_FP(Op, DAG);
4044 bool IsStrict = Op->isStrictFPOpcode();
4045 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
4047 // f16 conversions are promoted to f32 when full fp16 is not supported.
4048 if (Op.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
4051 SDValue Val = DAG.getNode(Op.getOpcode(), dl, {MVT::f32, MVT::Other},
4052 {Op.getOperand(0), SrcVal});
4054 ISD::STRICT_FP_ROUND, dl, {MVT::f16, MVT::Other},
4055 {Val.getValue(1), Val.getValue(0), DAG.getIntPtrConstant(0, dl)});
4058 ISD::FP_ROUND, dl, MVT::f16,
4059 DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
4060 DAG.getIntPtrConstant(0, dl));
4063 // i128 conversions are libcalls.
4064 if (SrcVal.getValueType() == MVT::i128)
4067 // Other conversions are legal, unless it's to the completely software-based
4069 if (Op.getValueType() != MVT::f128)
4074 SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
4075 SelectionDAG &DAG) const {
4076 // For iOS, we want to call an alternative entry point: __sincos_stret,
4077 // which returns the values in two S / D registers.
4079 SDValue Arg = Op.getOperand(0);
4080 EVT ArgVT = Arg.getValueType();
4081 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
4088 Entry.IsSExt = false;
4089 Entry.IsZExt = false;
4090 Args.push_back(Entry);
4092 RTLIB::Libcall LC = ArgVT == MVT::f64 ? RTLIB::SINCOS_STRET_F64
4093 : RTLIB::SINCOS_STRET_F32;
4094 const char *LibcallName = getLibcallName(LC);
4096 DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));
4098 StructType *RetTy = StructType::get(ArgTy, ArgTy);
4099 TargetLowering::CallLoweringInfo CLI(DAG);
4101 .setChain(DAG.getEntryNode())
4102 .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
4104 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
4105 return CallResult.first;
4108 static MVT getSVEContainerType(EVT ContentTy);
4110 SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op,
4111 SelectionDAG &DAG) const {
4112 EVT OpVT = Op.getValueType();
4113 EVT ArgVT = Op.getOperand(0).getValueType();
4115 if (useSVEForFixedLengthVectorVT(OpVT))
4116 return LowerFixedLengthBitcastToSVE(Op, DAG);
4118 if (OpVT.isScalableVector()) {
4119 // Bitcasting between unpacked vector types of different element counts is
4120 // not a NOP because the live elements are laid out differently.
4122 // e.g. nxv2i32 = XX??XX??
4123 // nxv4f16 = X?X?X?X?
4124 if (OpVT.getVectorElementCount() != ArgVT.getVectorElementCount())
4127 if (isTypeLegal(OpVT) && !isTypeLegal(ArgVT)) {
4128 assert(OpVT.isFloatingPoint() && !ArgVT.isFloatingPoint() &&
4129 "Expected int->fp bitcast!");
4131 DAG.getNode(ISD::ANY_EXTEND, SDLoc(Op), getSVEContainerType(ArgVT),
4133 return getSVESafeBitCast(OpVT, ExtResult, DAG);
4135 return getSVESafeBitCast(OpVT, Op.getOperand(0), DAG);
4138 if (OpVT != MVT::f16 && OpVT != MVT::bf16)
4141 // Bitcasts between f16 and bf16 are legal.
4142 if (ArgVT == MVT::f16 || ArgVT == MVT::bf16)
4145 assert(ArgVT == MVT::i16);
4148 Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
4149 Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
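// The (b)f16 result is the low 16 bits of that f32 value, extracted via the
// hsub subregister below.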
4151 DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
4152 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
4156 static EVT getExtensionTo64Bits(const EVT &OrigVT) {
4157 if (OrigVT.getSizeInBits() >= 64)
4160 assert(OrigVT.isSimple() && "Expecting a simple value type");
4162 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
4163 switch (OrigSimpleTy) {
4164 default: llvm_unreachable("Unexpected Vector Type");
4173 static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
4176 unsigned ExtOpcode) {
4177 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
4178 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
4179 // 64-bits we need to insert a new extension so that it will be 64-bits.
4180 assert(ExtTy.is128BitVector() && "Unexpected extension size");
4181 if (OrigTy.getSizeInBits() >= 64)
4184 // Must extend size to at least 64 bits to be used as an operand for VMULL.
4185 EVT NewVT = getExtensionTo64Bits(OrigTy);
4187 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
4190 static bool isOperandOfHigherHalf(SDValue &Op) {
4191 SDNode *OpNode = Op.getNode();
4192 if (OpNode->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
4195 ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpNode->getOperand(1));
4196 if (!C || C->getZExtValue() != 1)
4199 EVT VT = OpNode->getOperand(0).getValueType();
4201 return VT.isFixedLengthVector() && VT.getVectorNumElements() == 2;
4204 static bool areOperandsOfHigherHalf(SDValue &Op1, SDValue &Op2) {
4205 return isOperandOfHigherHalf(Op1) && isOperandOfHigherHalf(Op2);
4208 static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
4210 EVT VT = N->getValueType(0);
4212 if (N->getOpcode() != ISD::BUILD_VECTOR)
4215 for (const SDValue &Elt : N->op_values()) {
4216 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
4217 unsigned EltSize = VT.getScalarSizeInBits();
4218 unsigned HalfSize = EltSize / 2;
4220 if (!isIntN(HalfSize, C->getSExtValue()))
4223 if (!isUIntN(HalfSize, C->getZExtValue()))
4234 static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
4235 if (N->getOpcode() == ISD::SIGN_EXTEND ||
4236 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
4237 return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
4238 N->getOperand(0)->getValueType(0),
4242 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
4243 EVT VT = N->getValueType(0);
4245 unsigned EltSize = VT.getScalarSizeInBits() / 2;
4246 unsigned NumElts = VT.getVectorNumElements();
4247 MVT TruncVT = MVT::getIntegerVT(EltSize);
4248 SmallVector<SDValue, 8> Ops;
4249 for (unsigned i = 0; i != NumElts; ++i) {
4250 ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
4251 const APInt &CInt = C->getAPIntValue();
4252 // Element types smaller than 32 bits are not legal, so use i32 elements.
4253 // The values are implicitly truncated so sext vs. zext doesn't matter.
4254 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
4256 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
4259 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
4260 return N->getOpcode() == ISD::SIGN_EXTEND ||
4261 N->getOpcode() == ISD::ANY_EXTEND ||
4262 isExtendedBUILD_VECTOR(N, DAG, true);
4265 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
4266 return N->getOpcode() == ISD::ZERO_EXTEND ||
4267 N->getOpcode() == ISD::ANY_EXTEND ||
4268 isExtendedBUILD_VECTOR(N, DAG, false);
4271 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
4272 unsigned Opcode = N->getOpcode();
4273 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4274 SDNode *N0 = N->getOperand(0).getNode();
4275 SDNode *N1 = N->getOperand(1).getNode();
4276 return N0->hasOneUse() && N1->hasOneUse() &&
4277 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
4282 static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
4283 unsigned Opcode = N->getOpcode();
4284 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
4285 SDNode *N0 = N->getOperand(0).getNode();
4286 SDNode *N1 = N->getOperand(1).getNode();
4287 return N0->hasOneUse() && N1->hasOneUse() &&
4288 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
4293 SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
4294 SelectionDAG &DAG) const {
4295 // The rounding mode is in bits 23:22 of the FPCR.
4296 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0.
4297 // The formula we use to implement this is ((FPCR + (1 << 22)) >> 22) & 3,
4298 // so that the shift + and get folded into a bitfield extract.
4301 SDValue Chain = Op.getOperand(0);
4302 SDValue FPCR_64 = DAG.getNode(
4303 ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
4304 {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
4305 Chain = FPCR_64.getValue(1);
4306 SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
4307 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
4308 DAG.getConstant(1U << 22, dl, MVT::i32));
4309 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
4310 DAG.getConstant(22, dl, MVT::i32));
4311 SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
4312 DAG.getConstant(3, dl, MVT::i32));
4313 return DAG.getMergeValues({AND, Chain}, dl);
4316 SDValue AArch64TargetLowering::LowerSET_ROUNDING(SDValue Op,
4317 SelectionDAG &DAG) const {
4319 SDValue Chain = Op->getOperand(0);
4320 SDValue RMValue = Op->getOperand(1);
4322 // The rounding mode is in bits 23:22 of the FPCR.
4323 // The llvm.set.rounding argument value to the rounding mode in FPCR mapping
4324 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
4325 // (((arg - 1) & 3) << 22).
4327 // The argument of llvm.set.rounding must be within the segment [0, 3], so
4328 // NearestTiesToAway (4) is not handled here. It is the responsibility of the
4329 // code that generated llvm.set.rounding to ensure this condition.
4331 // Calculate new value of FPCR[23:22].
4332 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
4333 DAG.getConstant(1, DL, MVT::i32));
4334 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
4335 DAG.getConstant(0x3, DL, MVT::i32));
4337 DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
4338 DAG.getConstant(AArch64::RoundingBitsPos, DL, MVT::i32));
4339 RMValue = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, RMValue);
4341 // Get current value of FPCR.
4343 Chain, DAG.getTargetConstant(Intrinsic::aarch64_get_fpcr, DL, MVT::i64)};
4345 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i64, MVT::Other}, Ops);
4346 Chain = FPCR.getValue(1);
4347 FPCR = FPCR.getValue(0);
4349 // Put the new rounding mode into FPCR[23:22].
4350 const int RMMask = ~(AArch64::Rounding::rmMask << AArch64::RoundingBitsPos);
4351 FPCR = DAG.getNode(ISD::AND, DL, MVT::i64, FPCR,
4352 DAG.getConstant(RMMask, DL, MVT::i64));
4353 FPCR = DAG.getNode(ISD::OR, DL, MVT::i64, FPCR, RMValue);
4355 Chain, DAG.getTargetConstant(Intrinsic::aarch64_set_fpcr, DL, MVT::i64),
4357 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
4360 SDValue AArch64TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
4361 EVT VT = Op.getValueType();
4363 // If SVE is available then i64 vector multiplications can also be made legal.
4364 bool OverrideNEON = VT == MVT::v2i64 || VT == MVT::v1i64;
4366 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT, OverrideNEON))
4367 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MUL_PRED);
4369 // Multiplications are only custom-lowered for 128-bit vectors so that
4370 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
4371 assert(VT.is128BitVector() && VT.isInteger() &&
4372 "unexpected type for custom-lowering ISD::MUL");
4373 SDNode *N0 = Op.getOperand(0).getNode();
4374 SDNode *N1 = Op.getOperand(1).getNode();
4375 unsigned NewOpc = 0;
4377 bool isN0SExt = isSignExtended(N0, DAG);
4378 bool isN1SExt = isSignExtended(N1, DAG);
4379 if (isN0SExt && isN1SExt)
4380 NewOpc = AArch64ISD::SMULL;
4382 bool isN0ZExt = isZeroExtended(N0, DAG);
4383 bool isN1ZExt = isZeroExtended(N1, DAG);
4384 if (isN0ZExt && isN1ZExt)
4385 NewOpc = AArch64ISD::UMULL;
4386 else if (isN1SExt || isN1ZExt) {
4387 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
4388 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
4389 if (isN1SExt && isAddSubSExt(N0, DAG)) {
4390 NewOpc = AArch64ISD::SMULL;
4392 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
4393 NewOpc = AArch64ISD::UMULL;
4395 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
4397 NewOpc = AArch64ISD::UMULL;
4403 if (VT == MVT::v2i64)
4404 // Fall through to expand this. It is not legal.
4407 // Other vector multiplications are legal.
4412 // Legalize to a S/UMULL instruction
4415 SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
4417 Op0 = skipExtensionForVectorMULL(N0, DAG);
4418 assert(Op0.getValueType().is64BitVector() &&
4419 Op1.getValueType().is64BitVector() &&
4420 "unexpected types for extended operands to VMULL");
4421 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
4423 // Optimize (zext A + zext B) * C to (S/UMULL A, C) + (S/UMULL B, C) during
4424 // isel lowering to take advantage of no-stall back-to-back s/umul + s/umla.
4425 // This is beneficial for CPUs with accumulate forwarding such as Cortex-A53/A57.
4426 SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
4427 SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
4428 EVT Op1VT = Op1.getValueType();
4429 return DAG.getNode(N0->getOpcode(), DL, VT,
4430 DAG.getNode(NewOpc, DL, VT,
4431 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
4432 DAG.getNode(NewOpc, DL, VT,
4433 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
4436 static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
4438 if (VT == MVT::nxv1i1 && Pattern == AArch64SVEPredPattern::all)
4439 return DAG.getConstant(1, DL, MVT::nxv1i1);
4440 return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
4441 DAG.getTargetConstant(Pattern, DL, MVT::i32));
4444 // Returns a safe bitcast between two scalable vector predicates, where
4445 // any newly created lanes from a widening bitcast are defined as zero.
4446 static SDValue getSVEPredicateBitCast(EVT VT, SDValue Op, SelectionDAG &DAG) {
4448 EVT InVT = Op.getValueType();
4450 assert(InVT.getVectorElementType() == MVT::i1 &&
4451 VT.getVectorElementType() == MVT::i1 &&
4452 "Expected a predicate-to-predicate bitcast");
4453 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
4454 InVT.isScalableVector() &&
4455 DAG.getTargetLoweringInfo().isTypeLegal(InVT) &&
4456 "Only expect to cast between legal scalable predicate types!");
4458 // Return the operand if the cast isn't changing type,
4459 // e.g. <n x 16 x i1> -> <n x 16 x i1>
4463 SDValue Reinterpret = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
4465 // We only have to zero the lanes if new lanes are being defined, e.g. when
4466 // casting from <vscale x 2 x i1> to <vscale x 16 x i1>. If this is not the
4467 // case (e.g. when casting from <vscale x 16 x i1> -> <vscale x 2 x i1>) then
4468 // we can return here.
4469 if (InVT.bitsGT(VT))
4472 // Check if the other lanes are already known to be zeroed by
4473 // construction.
4474 if (isZeroingInactiveLanes(Op))
4477 // Zero the newly introduced lanes.
4478 SDValue Mask = DAG.getConstant(1, DL, InVT);
4479 Mask = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Mask);
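// Reinterpreting an all-active predicate of the narrower source type yields a
// mask that is set only in the lanes that existed before widening; the AND
// below clears the newly introduced lanes.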
4480 return DAG.getNode(ISD::AND, DL, VT, Reinterpret, Mask);
4483 SDValue AArch64TargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
4484 SelectionDAG &DAG) const {
4485 unsigned IntNo = Op.getConstantOperandVal(1);
4489 return SDValue(); // Don't custom lower most intrinsics.
4490 case Intrinsic::aarch64_mops_memset_tag: {
4491 auto Node = cast<MemIntrinsicSDNode>(Op.getNode());
4492 SDValue Chain = Node->getChain();
4493 SDValue Dst = Op.getOperand(2);
4494 SDValue Val = Op.getOperand(3);
4495 Val = DAG.getAnyExtOrTrunc(Val, DL, MVT::i64);
4496 SDValue Size = Op.getOperand(4);
4497 auto Alignment = Node->getMemOperand()->getAlign();
4498 bool IsVol = Node->isVolatile();
4499 auto DstPtrInfo = Node->getPointerInfo();
4502 static_cast<const AArch64SelectionDAGInfo &>(DAG.getSelectionDAGInfo());
4504 SDI.EmitMOPS(AArch64ISD::MOPS_MEMSET_TAGGING, DAG, DL, Chain, Dst, Val,
4505 Size, Alignment, IsVol, DstPtrInfo, MachinePointerInfo{});
4507 // MOPS_MEMSET_TAGGING has 3 results (DstWb, SizeWb, Chain) whereas the
4508 // intrinsic has 2. So hide SizeWb using MERGE_VALUES. Otherwise
4509 // LowerOperationWrapper will complain that the number of results has
4510 // changed.
4511 return DAG.getMergeValues({MS.getValue(0), MS.getValue(2)}, DL);
4513 case Intrinsic::aarch64_sme_get_pstatesm: {
4514 SDValue Chain = Op.getOperand(0);
4515 SDValue MRS = DAG.getNode(
4516 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
4517 Chain, DAG.getConstant(AArch64SysReg::SVCR, DL, MVT::i64));
4518 SDValue Mask = DAG.getConstant(/* PSTATE.SM */ 1, DL, MVT::i64);
4519 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i64, MRS, Mask);
4520 return DAG.getMergeValues({And, Chain}, DL);
4525 SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
4526 SelectionDAG &DAG) const {
4527 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
4530 default: return SDValue(); // Don't custom lower most intrinsics.
4531 case Intrinsic::thread_pointer: {
4532 EVT PtrVT = getPointerTy(DAG.getDataLayout());
4533 return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
4535 case Intrinsic::aarch64_neon_abs: {
4536 EVT Ty = Op.getValueType();
4537 if (Ty == MVT::i64) {
4538 SDValue Result = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64,
4540 Result = DAG.getNode(ISD::ABS, dl, MVT::v1i64, Result);
4541 return DAG.getNode(ISD::BITCAST, dl, MVT::i64, Result);
4542 } else if (Ty.isVector() && Ty.isInteger() && isTypeLegal(Ty)) {
4543 return DAG.getNode(ISD::ABS, dl, Ty, Op.getOperand(1));
4545 report_fatal_error("Unexpected type for AArch64 NEON intrinsic");
4548 case Intrinsic::aarch64_neon_pmull64: {
4549 SDValue Op1 = Op.getOperand(1);
4550 SDValue Op2 = Op.getOperand(2);
4552 // If both operands are the higher half of two source SIMD & FP registers,
4553 // ISel could make use of tablegen patterns to emit PMULL2. So do not
4554 // legalize i64 to v1i64.
4555 if (areOperandsOfHigherHalf(Op1, Op2))
4558 // As a general convention, use "v1" types to represent scalar integer
4559 // operations in vector registers. This helps ISel to make use of
4560 // tablegen patterns and generate a load into SIMD & FP registers directly.
4561 if (Op1.getValueType() == MVT::i64)
4562 Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op1);
4563 if (Op2.getValueType() == MVT::i64)
4564 Op2 = DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Op2);
4567 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
4568 DAG.getConstant(Intrinsic::aarch64_neon_pmull64, dl, MVT::i32), Op1,
4571 case Intrinsic::aarch64_neon_smax:
4572 return DAG.getNode(ISD::SMAX, dl, Op.getValueType(),
4573 Op.getOperand(1), Op.getOperand(2));
4574 case Intrinsic::aarch64_neon_umax:
4575 return DAG.getNode(ISD::UMAX, dl, Op.getValueType(),
4576 Op.getOperand(1), Op.getOperand(2));
4577 case Intrinsic::aarch64_neon_smin:
4578 return DAG.getNode(ISD::SMIN, dl, Op.getValueType(),
4579 Op.getOperand(1), Op.getOperand(2));
4580 case Intrinsic::aarch64_neon_umin:
4581 return DAG.getNode(ISD::UMIN, dl, Op.getValueType(),
4582 Op.getOperand(1), Op.getOperand(2));
4584 case Intrinsic::aarch64_sve_sunpkhi:
4585 return DAG.getNode(AArch64ISD::SUNPKHI, dl, Op.getValueType(),
4586 Op.getOperand(1));
4587 case Intrinsic::aarch64_sve_sunpklo:
4588 return DAG.getNode(AArch64ISD::SUNPKLO, dl, Op.getValueType(),
4589 Op.getOperand(1));
4590 case Intrinsic::aarch64_sve_uunpkhi:
4591 return DAG.getNode(AArch64ISD::UUNPKHI, dl, Op.getValueType(),
4592 Op.getOperand(1));
4593 case Intrinsic::aarch64_sve_uunpklo:
4594 return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
4595 Op.getOperand(1));
4596 case Intrinsic::aarch64_sve_clasta_n:
4597 return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
4598 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4599 case Intrinsic::aarch64_sve_clastb_n:
4600 return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
4601 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4602 case Intrinsic::aarch64_sve_lasta:
4603 return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
4604 Op.getOperand(1), Op.getOperand(2));
4605 case Intrinsic::aarch64_sve_lastb:
4606 return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
4607 Op.getOperand(1), Op.getOperand(2));
4608 case Intrinsic::aarch64_sve_rev:
4609 return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
4610 Op.getOperand(1));
4611 case Intrinsic::aarch64_sve_tbl:
4612 return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
4613 Op.getOperand(1), Op.getOperand(2));
4614 case Intrinsic::aarch64_sve_trn1:
4615 return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
4616 Op.getOperand(1), Op.getOperand(2));
4617 case Intrinsic::aarch64_sve_trn2:
4618 return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
4619 Op.getOperand(1), Op.getOperand(2));
4620 case Intrinsic::aarch64_sve_uzp1:
4621 return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
4622 Op.getOperand(1), Op.getOperand(2));
4623 case Intrinsic::aarch64_sve_uzp2:
4624 return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
4625 Op.getOperand(1), Op.getOperand(2));
4626 case Intrinsic::aarch64_sve_zip1:
4627 return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
4628 Op.getOperand(1), Op.getOperand(2));
4629 case Intrinsic::aarch64_sve_zip2:
4630 return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
4631 Op.getOperand(1), Op.getOperand(2));
4632 case Intrinsic::aarch64_sve_splice:
4633 return DAG.getNode(AArch64ISD::SPLICE, dl, Op.getValueType(),
4634 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
4635 case Intrinsic::aarch64_sve_ptrue:
4636 return getPTrue(DAG, dl, Op.getValueType(),
4637 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
4638 case Intrinsic::aarch64_sve_clz:
4639 return DAG.getNode(AArch64ISD::CTLZ_MERGE_PASSTHRU, dl, Op.getValueType(),
4640 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4641 case Intrinsic::aarch64_sme_cntsb:
4642 return DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4643 DAG.getConstant(1, dl, MVT::i32));
4644 case Intrinsic::aarch64_sme_cntsh: {
4645 SDValue One = DAG.getConstant(1, dl, MVT::i32);
4646 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(), One);
4647 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes, One);
4649 case Intrinsic::aarch64_sme_cntsw: {
4650 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4651 DAG.getConstant(1, dl, MVT::i32));
4652 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4653 DAG.getConstant(2, dl, MVT::i32));
4655 case Intrinsic::aarch64_sme_cntsd: {
4656 SDValue Bytes = DAG.getNode(AArch64ISD::RDSVL, dl, Op.getValueType(),
4657 DAG.getConstant(1, dl, MVT::i32));
4658 return DAG.getNode(ISD::SRL, dl, Op.getValueType(), Bytes,
4659 DAG.getConstant(3, dl, MVT::i32));
4661 case Intrinsic::aarch64_sve_cnt: {
4662 SDValue Data = Op.getOperand(3);
4663 // CTPOP only supports integer operands.
4664 if (Data.getValueType().isFloatingPoint())
4665 Data = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Data);
4666 return DAG.getNode(AArch64ISD::CTPOP_MERGE_PASSTHRU, dl, Op.getValueType(),
4667 Op.getOperand(2), Data, Op.getOperand(1));
4669 case Intrinsic::aarch64_sve_dupq_lane:
4670 return LowerDUPQLane(Op, DAG);
4671 case Intrinsic::aarch64_sve_convert_from_svbool:
4672 return getSVEPredicateBitCast(Op.getValueType(), Op.getOperand(1), DAG);
4673 case Intrinsic::aarch64_sve_convert_to_svbool:
4674 return getSVEPredicateBitCast(MVT::nxv16i1, Op.getOperand(1), DAG);
4675 case Intrinsic::aarch64_sve_fneg:
4676 return DAG.getNode(AArch64ISD::FNEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4677 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4678 case Intrinsic::aarch64_sve_frintp:
4679 return DAG.getNode(AArch64ISD::FCEIL_MERGE_PASSTHRU, dl, Op.getValueType(),
4680 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4681 case Intrinsic::aarch64_sve_frintm:
4682 return DAG.getNode(AArch64ISD::FFLOOR_MERGE_PASSTHRU, dl, Op.getValueType(),
4683 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4684 case Intrinsic::aarch64_sve_frinti:
4685 return DAG.getNode(AArch64ISD::FNEARBYINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4686 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4687 case Intrinsic::aarch64_sve_frintx:
4688 return DAG.getNode(AArch64ISD::FRINT_MERGE_PASSTHRU, dl, Op.getValueType(),
4689 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4690 case Intrinsic::aarch64_sve_frinta:
4691 return DAG.getNode(AArch64ISD::FROUND_MERGE_PASSTHRU, dl, Op.getValueType(),
4692 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4693 case Intrinsic::aarch64_sve_frintn:
4694 return DAG.getNode(AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU, dl, Op.getValueType(),
4695 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4696 case Intrinsic::aarch64_sve_frintz:
4697 return DAG.getNode(AArch64ISD::FTRUNC_MERGE_PASSTHRU, dl, Op.getValueType(),
4698 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4699 case Intrinsic::aarch64_sve_ucvtf:
4700 return DAG.getNode(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU, dl,
4701 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4702 Op.getOperand(1));
4703 case Intrinsic::aarch64_sve_scvtf:
4704 return DAG.getNode(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU, dl,
4705 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4706 Op.getOperand(1));
4707 case Intrinsic::aarch64_sve_fcvtzu:
4708 return DAG.getNode(AArch64ISD::FCVTZU_MERGE_PASSTHRU, dl,
4709 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4710 Op.getOperand(1));
4711 case Intrinsic::aarch64_sve_fcvtzs:
4712 return DAG.getNode(AArch64ISD::FCVTZS_MERGE_PASSTHRU, dl,
4713 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4714 Op.getOperand(1));
4715 case Intrinsic::aarch64_sve_fsqrt:
4716 return DAG.getNode(AArch64ISD::FSQRT_MERGE_PASSTHRU, dl, Op.getValueType(),
4717 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4718 case Intrinsic::aarch64_sve_frecpx:
4719 return DAG.getNode(AArch64ISD::FRECPX_MERGE_PASSTHRU, dl, Op.getValueType(),
4720 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4721 case Intrinsic::aarch64_sve_frecpe_x:
4722 return DAG.getNode(AArch64ISD::FRECPE, dl, Op.getValueType(),
4723 Op.getOperand(1));
4724 case Intrinsic::aarch64_sve_frecps_x:
4725 return DAG.getNode(AArch64ISD::FRECPS, dl, Op.getValueType(),
4726 Op.getOperand(1), Op.getOperand(2));
4727 case Intrinsic::aarch64_sve_frsqrte_x:
4728 return DAG.getNode(AArch64ISD::FRSQRTE, dl, Op.getValueType(),
4729 Op.getOperand(1));
4730 case Intrinsic::aarch64_sve_frsqrts_x:
4731 return DAG.getNode(AArch64ISD::FRSQRTS, dl, Op.getValueType(),
4732 Op.getOperand(1), Op.getOperand(2));
4733 case Intrinsic::aarch64_sve_fabs:
4734 return DAG.getNode(AArch64ISD::FABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4735 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4736 case Intrinsic::aarch64_sve_abs:
4737 return DAG.getNode(AArch64ISD::ABS_MERGE_PASSTHRU, dl, Op.getValueType(),
4738 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4739 case Intrinsic::aarch64_sve_neg:
4740 return DAG.getNode(AArch64ISD::NEG_MERGE_PASSTHRU, dl, Op.getValueType(),
4741 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4742 case Intrinsic::aarch64_sve_insr: {
4743 SDValue Scalar = Op.getOperand(2);
4744 EVT ScalarTy = Scalar.getValueType();
4745 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
4746 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
4748 return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
4749 Op.getOperand(1), Scalar);
4751 case Intrinsic::aarch64_sve_rbit:
4752 return DAG.getNode(AArch64ISD::BITREVERSE_MERGE_PASSTHRU, dl,
4753 Op.getValueType(), Op.getOperand(2), Op.getOperand(3),
4754 Op.getOperand(1));
4755 case Intrinsic::aarch64_sve_revb:
4756 return DAG.getNode(AArch64ISD::BSWAP_MERGE_PASSTHRU, dl, Op.getValueType(),
4757 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4758 case Intrinsic::aarch64_sve_revh:
4759 return DAG.getNode(AArch64ISD::REVH_MERGE_PASSTHRU, dl, Op.getValueType(),
4760 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4761 case Intrinsic::aarch64_sve_revw:
4762 return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
4763 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4764 case Intrinsic::aarch64_sve_revd:
4765 return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
4766 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
4767 case Intrinsic::aarch64_sve_sxtb:
4768 return DAG.getNode(
4769 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4770 Op.getOperand(2), Op.getOperand(3),
4771 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4772 Op.getOperand(1));
4773 case Intrinsic::aarch64_sve_sxth:
4774 return DAG.getNode(
4775 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4776 Op.getOperand(2), Op.getOperand(3),
4777 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4778 Op.getOperand(1));
4779 case Intrinsic::aarch64_sve_sxtw:
4780 return DAG.getNode(
4781 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4782 Op.getOperand(2), Op.getOperand(3),
4783 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4784 Op.getOperand(1));
4785 case Intrinsic::aarch64_sve_uxtb:
4786 return DAG.getNode(
4787 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4788 Op.getOperand(2), Op.getOperand(3),
4789 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i8)),
4790 Op.getOperand(1));
4791 case Intrinsic::aarch64_sve_uxth:
4792 return DAG.getNode(
4793 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4794 Op.getOperand(2), Op.getOperand(3),
4795 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i16)),
4796 Op.getOperand(1));
4797 case Intrinsic::aarch64_sve_uxtw:
4798 return DAG.getNode(
4799 AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
4800 Op.getOperand(2), Op.getOperand(3),
4801 DAG.getValueType(Op.getValueType().changeVectorElementType(MVT::i32)),
4802 Op.getOperand(1));
4803 case Intrinsic::localaddress: {
4804 const auto &MF = DAG.getMachineFunction();
4805 const auto *RegInfo = Subtarget->getRegisterInfo();
4806 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
4807 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
4808 Op.getSimpleValueType());
4811 case Intrinsic::eh_recoverfp: {
4812 // FIXME: This needs to be implemented to correctly handle highly aligned
4813 // stack objects. For now we simply return the incoming FP. Refer to D53541
4814 // for more details.
4815 SDValue FnOp = Op.getOperand(1);
4816 SDValue IncomingFPOp = Op.getOperand(2);
4817 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
4818 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
4819 if (!Fn)
4820 report_fatal_error(
4821 "llvm.eh.recoverfp must take a function as the first argument");
4822 return IncomingFPOp;
4825 case Intrinsic::aarch64_neon_vsri:
4826 case Intrinsic::aarch64_neon_vsli: {
4827 EVT Ty = Op.getValueType();
4829 if (!Ty.isVector())
4830 report_fatal_error("Unexpected type for aarch64_neon_vsli");
4832 assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
4834 bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
4835 unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
4836 return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
4837 Op.getOperand(3));
4840 case Intrinsic::aarch64_neon_srhadd:
4841 case Intrinsic::aarch64_neon_urhadd:
4842 case Intrinsic::aarch64_neon_shadd:
4843 case Intrinsic::aarch64_neon_uhadd: {
4844 bool IsSignedAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4845 IntNo == Intrinsic::aarch64_neon_shadd);
4846 bool IsRoundingAdd = (IntNo == Intrinsic::aarch64_neon_srhadd ||
4847 IntNo == Intrinsic::aarch64_neon_urhadd);
4848 unsigned Opcode = IsSignedAdd
4849 ? (IsRoundingAdd ? ISD::AVGCEILS : ISD::AVGFLOORS)
4850 : (IsRoundingAdd ? ISD::AVGCEILU : ISD::AVGFLOORU);
4851 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4852 Op.getOperand(2));
4854 case Intrinsic::aarch64_neon_sabd:
4855 case Intrinsic::aarch64_neon_uabd: {
4856 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uabd ? ISD::ABDU
4857 : ISD::ABDS;
4858 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4859 Op.getOperand(2));
4861 case Intrinsic::aarch64_neon_saddlp:
4862 case Intrinsic::aarch64_neon_uaddlp: {
4863 unsigned Opcode = IntNo == Intrinsic::aarch64_neon_uaddlp
4864 ? AArch64ISD::UADDLP
4865 : AArch64ISD::SADDLP;
4866 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1));
4868 case Intrinsic::aarch64_neon_sdot:
4869 case Intrinsic::aarch64_neon_udot:
4870 case Intrinsic::aarch64_sve_sdot:
4871 case Intrinsic::aarch64_sve_udot: {
4872 unsigned Opcode = (IntNo == Intrinsic::aarch64_neon_udot ||
4873 IntNo == Intrinsic::aarch64_sve_udot)
4874 ? AArch64ISD::UDOT
4875 : AArch64ISD::SDOT;
4876 return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
4877 Op.getOperand(2), Op.getOperand(3));
4879 case Intrinsic::get_active_lane_mask: {
4880 SDValue ID =
4881 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, MVT::i64);
4882 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(), ID,
4883 Op.getOperand(1), Op.getOperand(2));
4888 bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const {
4889 if (VT.getVectorElementType() == MVT::i8 ||
4890 VT.getVectorElementType() == MVT::i16) {
4891 EltTy = MVT::i32;
4892 return true;
4893 }
4894 return false;
4897 bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT IndexVT,
4898 EVT DataVT) const {
4899 // SVE only supports implicit extension of 32-bit indices.
4900 if (!Subtarget->hasSVE() || IndexVT.getVectorElementType() != MVT::i32)
4901 return false;
4903 // Indices cannot be smaller than the main data type.
4904 if (IndexVT.getScalarSizeInBits() < DataVT.getScalarSizeInBits())
4905 return false;
4907 // Scalable vectors with "vscale * 2" or fewer elements sit within a 64-bit
4908 // element container type, which would violate the previous clause.
4909 return DataVT.isFixedLengthVector() || DataVT.getVectorMinNumElements() > 2;
4912 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
4913 return ExtVal.getValueType().isScalableVector() ||
4914 useSVEForFixedLengthVectorVT(
4915 ExtVal.getValueType(),
4916 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors());
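// Map the (scaled, signed, extend) triple describing a gather's addressing
// mode onto the corresponding zero-extending GLD1 node.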
4919 unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
4920 std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
4921 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
4922 AArch64ISD::GLD1_MERGE_ZERO},
4923 {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
4924 AArch64ISD::GLD1_UXTW_MERGE_ZERO},
4925 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
4926 AArch64ISD::GLD1_MERGE_ZERO},
4927 {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
4928 AArch64ISD::GLD1_SXTW_MERGE_ZERO},
4929 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
4930 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4931 {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
4932 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
4933 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
4934 AArch64ISD::GLD1_SCALED_MERGE_ZERO},
4935 {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
4936 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
4937 };
4938 auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
4939 return AddrModes.find(Key)->second;
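// Convert a zero-extending gather opcode into its sign-extending (GLD1S)
// equivalent.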
4942 unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
4943 switch (Opcode) {
4944 default:
4945 llvm_unreachable("unimplemented opcode");
4947 case AArch64ISD::GLD1_MERGE_ZERO:
4948 return AArch64ISD::GLD1S_MERGE_ZERO;
4949 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
4950 return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
4951 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
4952 return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
4953 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
4954 return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
4955 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
4956 return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
4957 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
4958 return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
4959 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
4960 return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
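// Lower MGATHER nodes: non-zero passthrough values are handled with an
// explicit select, indices are pre-scaled when SVE cannot scale them
// implicitly, and fixed-length gathers are rewritten in terms of scalable
// containers.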
4964 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
4965 SelectionDAG &DAG) const {
4966 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
4968 SDLoc DL(Op);
4969 SDValue Chain = MGT->getChain();
4970 SDValue PassThru = MGT->getPassThru();
4971 SDValue Mask = MGT->getMask();
4972 SDValue BasePtr = MGT->getBasePtr();
4973 SDValue Index = MGT->getIndex();
4974 SDValue Scale = MGT->getScale();
4975 EVT VT = Op.getValueType();
4976 EVT MemVT = MGT->getMemoryVT();
4977 ISD::LoadExtType ExtType = MGT->getExtensionType();
4978 ISD::MemIndexType IndexType = MGT->getIndexType();
4980 // SVE supports zero (and so undef) passthrough values only; everything else
4981 // must be handled manually by an explicit select on the load's output.
4982 if (!PassThru->isUndef() && !isZerosVector(PassThru.getNode())) {
4983 SDValue Ops[] = {Chain, DAG.getUNDEF(VT), Mask, BasePtr, Index, Scale};
4984 SDValue Load =
4985 DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
4986 MGT->getMemOperand(), IndexType, ExtType);
4987 SDValue Select = DAG.getSelect(DL, VT, Mask, Load, PassThru);
4988 return DAG.getMergeValues({Select, Load.getValue(1)}, DL);
4991 bool IsScaled = MGT->isIndexScaled();
4992 bool IsSigned = MGT->isIndexSigned();
4994 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
4995 // must be calculated beforehand.
4996 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
4997 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
4998 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
4999 EVT IndexVT = Index.getValueType();
5000 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5001 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5002 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5004 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5005 return DAG.getMaskedGather(MGT->getVTList(), MemVT, DL, Ops,
5006 MGT->getMemOperand(), IndexType, ExtType);
5009 // Lower fixed length gather to a scalable equivalent.
5010 if (VT.isFixedLengthVector()) {
5011 assert(Subtarget->useSVEForFixedLengthVectors() &&
5012 "Cannot lower when not using SVE for fixed vectors!");
5014 // NOTE: Handle floating-point as if integer then bitcast the result.
5015 EVT DataVT = VT.changeVectorElementTypeToInteger();
5016 MemVT = MemVT.changeVectorElementTypeToInteger();
5018 // Find the smallest integer fixed length vector we can use for the gather.
5019 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5020 if (DataVT.getVectorElementType() == MVT::i64 ||
5021 Index.getValueType().getVectorElementType() == MVT::i64 ||
5022 Mask.getValueType().getVectorElementType() == MVT::i64)
5023 PromotedVT = VT.changeVectorElementType(MVT::i64);
5025 // Promote vector operands except for passthrough, which we know is either
5026 // undef or zero, and thus best constructed directly.
5027 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5028 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5029 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5031 // A promoted result type forces the need for an extending load.
5032 if (PromotedVT != DataVT && ExtType == ISD::NON_EXTLOAD)
5033 ExtType = ISD::EXTLOAD;
5035 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5037 // Convert fixed length vector operands to scalable.
5038 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5039 Index = convertToScalableVector(DAG, ContainerVT, Index);
5040 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5041 PassThru = PassThru->isUndef() ? DAG.getUNDEF(ContainerVT)
5042 : DAG.getConstant(0, DL, ContainerVT);
5044 // Emit equivalent scalable vector gather.
5045 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
5046 SDValue Load =
5047 DAG.getMaskedGather(DAG.getVTList(ContainerVT, MVT::Other), MemVT, DL,
5048 Ops, MGT->getMemOperand(), IndexType, ExtType);
5050 // Extract fixed length data then convert to the required result type.
5051 SDValue Result = convertFromScalableVector(DAG, PromotedVT, Load);
5052 Result = DAG.getNode(ISD::TRUNCATE, DL, DataVT, Result);
5053 if (VT.isFloatingPoint())
5054 Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
5056 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5057 }
5059 // Everything else is legal.
5060 return Op;
5061 }
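// Lower MSCATTER nodes following the same approach as LowerMGATHER:
// pre-scale indices SVE cannot scale implicitly and rewrite fixed-length
// scatters in terms of scalable containers.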
5063 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
5064 SelectionDAG &DAG) const {
5065 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
5068 SDValue Chain = MSC->getChain();
5069 SDValue StoreVal = MSC->getValue();
5070 SDValue Mask = MSC->getMask();
5071 SDValue BasePtr = MSC->getBasePtr();
5072 SDValue Index = MSC->getIndex();
5073 SDValue Scale = MSC->getScale();
5074 EVT VT = StoreVal.getValueType();
5075 EVT MemVT = MSC->getMemoryVT();
5076 ISD::MemIndexType IndexType = MSC->getIndexType();
5077 bool Truncating = MSC->isTruncatingStore();
5079 bool IsScaled = MSC->isIndexScaled();
5080 bool IsSigned = MSC->isIndexSigned();
5082 // SVE supports an index scaled by sizeof(MemVT.elt) only; everything else
5083 // must be calculated beforehand.
5084 uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
5085 if (IsScaled && ScaleVal != MemVT.getScalarStoreSize()) {
5086 assert(isPowerOf2_64(ScaleVal) && "Expecting power-of-two types");
5087 EVT IndexVT = Index.getValueType();
5088 Index = DAG.getNode(ISD::SHL, DL, IndexVT, Index,
5089 DAG.getConstant(Log2_32(ScaleVal), DL, IndexVT));
5090 Scale = DAG.getTargetConstant(1, DL, Scale.getValueType());
5092 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5093 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5094 MSC->getMemOperand(), IndexType, Truncating);
5097 // Lower fixed length scatter to a scalable equivalent.
5098 if (VT.isFixedLengthVector()) {
5099 assert(Subtarget->useSVEForFixedLengthVectors() &&
5100 "Cannot lower when not using SVE for fixed vectors!");
5102 // Once bitcast we treat floating-point scatters as if integer.
5103 if (VT.isFloatingPoint()) {
5104 VT = VT.changeVectorElementTypeToInteger();
5105 MemVT = MemVT.changeVectorElementTypeToInteger();
5106 StoreVal = DAG.getNode(ISD::BITCAST, DL, VT, StoreVal);
5109 // Find the smallest integer fixed length vector we can use for the scatter.
5110 EVT PromotedVT = VT.changeVectorElementType(MVT::i32);
5111 if (VT.getVectorElementType() == MVT::i64 ||
5112 Index.getValueType().getVectorElementType() == MVT::i64 ||
5113 Mask.getValueType().getVectorElementType() == MVT::i64)
5114 PromotedVT = VT.changeVectorElementType(MVT::i64);
5116 // Promote vector operands.
5117 unsigned ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
5118 Index = DAG.getNode(ExtOpcode, DL, PromotedVT, Index);
5119 Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, PromotedVT, Mask);
5120 StoreVal = DAG.getNode(ISD::ANY_EXTEND, DL, PromotedVT, StoreVal);
5122 // A promoted value type forces the need for a truncating store.
5123 if (PromotedVT != VT)
5124 Truncating = true;
5126 EVT ContainerVT = getContainerForFixedLengthVector(DAG, PromotedVT);
5128 // Convert fixed length vector operands to scalable.
5129 MemVT = ContainerVT.changeVectorElementType(MemVT.getVectorElementType());
5130 Index = convertToScalableVector(DAG, ContainerVT, Index);
5131 Mask = convertFixedMaskToScalableVector(Mask, DAG);
5132 StoreVal = convertToScalableVector(DAG, ContainerVT, StoreVal);
5134 // Emit equivalent scalable vector scatter.
5135 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
5136 return DAG.getMaskedScatter(MSC->getVTList(), MemVT, DL, Ops,
5137 MSC->getMemOperand(), IndexType, Truncating);
5138 }
5140 // Everything else is legal.
5141 return Op;
5142 }
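// Lower MLOAD nodes: fixed-length vectors are handled via SVE, and non-zero
// passthrough values are emulated with a zero-passthrough load followed by an
// explicit select, since SVE masked loads only zero inactive lanes.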
5144 SDValue AArch64TargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
5145 SDLoc DL(Op);
5146 MaskedLoadSDNode *LoadNode = cast<MaskedLoadSDNode>(Op);
5147 assert(LoadNode && "Expected custom lowering of a masked load node");
5148 EVT VT = Op->getValueType(0);
5150 if (useSVEForFixedLengthVectorVT(
5151 VT,
5152 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5153 return LowerFixedLengthVectorMLoadToSVE(Op, DAG);
5155 SDValue PassThru = LoadNode->getPassThru();
5156 SDValue Mask = LoadNode->getMask();
5158 if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
5159 return Op;
5161 SDValue Load = DAG.getMaskedLoad(
5162 VT, DL, LoadNode->getChain(), LoadNode->getBasePtr(),
5163 LoadNode->getOffset(), Mask, DAG.getUNDEF(VT), LoadNode->getMemoryVT(),
5164 LoadNode->getMemOperand(), LoadNode->getAddressingMode(),
5165 LoadNode->getExtensionType());
5167 SDValue Result = DAG.getSelect(DL, VT, Mask, Load, PassThru);
5169 return DAG.getMergeValues({Result, Load.getValue(1)}, DL);
5172 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
5173 static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
5174 EVT VT, EVT MemVT,
5175 SelectionDAG &DAG) {
5176 assert(VT.isVector() && "VT should be a vector type");
5177 assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
5179 SDValue Value = ST->getValue();
5181 // It first extends the promoted v4i16 to v8i16, truncates that to v8i8, and
5182 // extracts the word lane which represents the v4i8 subvector, so the store
5183 // can then be done as a single 32-bit store of that lane.
5188 SDValue Undef = DAG.getUNDEF(MVT::i16);
5189 SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
5190 {Undef, Undef, Undef, Undef});
5192 SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
5193 Value, UndefVec);
5194 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
5196 Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
5197 SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
5198 Trunc, DAG.getConstant(0, DL, MVT::i64));
5200 return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
5201 ST->getBasePtr(), ST->getMemOperand());
5204 // Custom lowering for any store, vector or scalar, normal or truncating.
5205 // Currently we only custom lower truncating stores from v4i16 to v4i8 and
5206 // volatile stores of i128.
5207 SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
5208 SelectionDAG &DAG) const {
5209 SDLoc Dl(Op);
5210 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
5211 assert (StoreNode && "Can only custom lower store nodes");
5213 SDValue Value = StoreNode->getValue();
5215 EVT VT = Value.getValueType();
5216 EVT MemVT = StoreNode->getMemoryVT();
5218 if (VT.isVector()) {
5219 if (useSVEForFixedLengthVectorVT(
5220 VT,
5221 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
5222 return LowerFixedLengthVectorStoreToSVE(Op, DAG);
5224 unsigned AS = StoreNode->getAddressSpace();
5225 Align Alignment = StoreNode->getAlign();
5226 if (Alignment < MemVT.getStoreSize() &&
5227 !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment,
5228 StoreNode->getMemOperand()->getFlags(),
5229 nullptr))
5230 return scalarizeVectorStore(StoreNode, DAG);
5233 if (StoreNode->isTruncatingStore() && VT == MVT::v4i16 &&
5234 MemVT == MVT::v4i8) {
5235 return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
5237 // 256 bit non-temporal stores can be lowered to STNP. Do this as part of
5238 // the custom lowering, as there are no un-paired non-temporal stores and
5239 // legalization will break up 256 bit inputs.
5240 ElementCount EC = MemVT.getVectorElementCount();
5241 if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
5242 EC.isKnownEven() &&
5243 ((MemVT.getScalarSizeInBits() == 8u ||
5244 MemVT.getScalarSizeInBits() == 16u ||
5245 MemVT.getScalarSizeInBits() == 32u ||
5246 MemVT.getScalarSizeInBits() == 64u))) {
5247 SDValue Lo =
5248 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5249 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5250 StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
5251 SDValue Hi =
5252 DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
5253 MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
5254 StoreNode->getValue(),
5255 DAG.getConstant(EC.getKnownMinValue() / 2, Dl, MVT::i64));
5256 SDValue Result = DAG.getMemIntrinsicNode(
5257 AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
5258 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5259 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5260 return Result;
5261 }
5262 } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
5263 return LowerStore128(Op, DAG);
5264 } else if (MemVT == MVT::i64x8) {
5265 SDValue Value = StoreNode->getValue();
5266 assert(Value->getValueType(0) == MVT::i64x8);
5267 SDValue Chain = StoreNode->getChain();
5268 SDValue Base = StoreNode->getBasePtr();
5269 EVT PtrVT = Base.getValueType();
5270 for (unsigned i = 0; i < 8; i++) {
5271 SDValue Part = DAG.getNode(AArch64ISD::LS64_EXTRACT, Dl, MVT::i64,
5272 Value, DAG.getConstant(i, Dl, MVT::i32));
5273 SDValue Ptr = DAG.getNode(ISD::ADD, Dl, PtrVT, Base,
5274 DAG.getConstant(i * 8, Dl, PtrVT));
5275 Chain = DAG.getStore(Chain, Dl, Part, Ptr, StoreNode->getPointerInfo(),
5276 StoreNode->getOriginalAlign());
5277 }
5278 return Chain;
5284 /// Lower atomic or volatile 128-bit stores to a single STP instruction.
5285 SDValue AArch64TargetLowering::LowerStore128(SDValue Op,
5286 SelectionDAG &DAG) const {
5287 MemSDNode *StoreNode = cast<MemSDNode>(Op);
5288 assert(StoreNode->getMemoryVT() == MVT::i128);
5289 assert(StoreNode->isVolatile() || StoreNode->isAtomic());
5290 assert(!StoreNode->isAtomic() ||
5291 StoreNode->getMergedOrdering() == AtomicOrdering::Unordered ||
5292 StoreNode->getMergedOrdering() == AtomicOrdering::Monotonic);
5293 SDLoc DL(Op);
5294 SDValue Value = StoreNode->getOpcode() == ISD::STORE
5295 ? StoreNode->getOperand(1)
5296 : StoreNode->getOperand(2);
5298 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5299 DAG.getConstant(0, DL, MVT::i64));
5300 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, Value,
5301 DAG.getConstant(1, DL, MVT::i64));
5302 SDValue Result = DAG.getMemIntrinsicNode(
5303 AArch64ISD::STP, DL, DAG.getVTList(MVT::Other),
5304 {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
5305 StoreNode->getMemoryVT(), StoreNode->getMemOperand());
5306 return Result;
5307 }
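// Custom lowering of loads: i64x8 (LS64) loads are split into eight i64
// loads rebuilt with LS64_BUILD, and extending v4i8 loads are emitted as a
// single 32-bit load followed by a vector extend.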
5309 SDValue AArch64TargetLowering::LowerLOAD(SDValue Op,
5310 SelectionDAG &DAG) const {
5311 SDLoc DL(Op);
5312 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
5313 assert(LoadNode && "Expected custom lowering of a load node");
5315 if (LoadNode->getMemoryVT() == MVT::i64x8) {
5316 SmallVector<SDValue, 8> Ops;
5317 SDValue Base = LoadNode->getBasePtr();
5318 SDValue Chain = LoadNode->getChain();
5319 EVT PtrVT = Base.getValueType();
5320 for (unsigned i = 0; i < 8; i++) {
5321 SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, Base,
5322 DAG.getConstant(i * 8, DL, PtrVT));
5323 SDValue Part = DAG.getLoad(MVT::i64, DL, Chain, Ptr,
5324 LoadNode->getPointerInfo(),
5325 LoadNode->getOriginalAlign());
5326 Ops.push_back(Part);
5327 Chain = SDValue(Part.getNode(), 1);
5329 SDValue Loaded = DAG.getNode(AArch64ISD::LS64_BUILD, DL, MVT::i64x8, Ops);
5330 return DAG.getMergeValues({Loaded, Chain}, DL);
5333 // Custom lowering for extending v4i8 vector loads.
5334 EVT VT = Op->getValueType(0);
5335 assert((VT == MVT::v4i16 || VT == MVT::v4i32) && "Expected v4i16 or v4i32");
5337 if (LoadNode->getMemoryVT() != MVT::v4i8)
5338 return SDValue();
5340 unsigned ExtType;
5341 if (LoadNode->getExtensionType() == ISD::SEXTLOAD)
5342 ExtType = ISD::SIGN_EXTEND;
5343 else if (LoadNode->getExtensionType() == ISD::ZEXTLOAD ||
5344 LoadNode->getExtensionType() == ISD::EXTLOAD)
5345 ExtType = ISD::ZERO_EXTEND;
5346 else
5347 return SDValue();
5349 SDValue Load = DAG.getLoad(MVT::f32, DL, LoadNode->getChain(),
5350 LoadNode->getBasePtr(), MachinePointerInfo());
5351 SDValue Chain = Load.getValue(1);
5352 SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f32, Load);
5353 SDValue BC = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Vec);
5354 SDValue Ext = DAG.getNode(ExtType, DL, MVT::v8i16, BC);
5355 Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i16, Ext,
5356 DAG.getConstant(0, DL, MVT::i64));
5357 if (VT == MVT::v4i32)
5358 Ext = DAG.getNode(ExtType, DL, MVT::v4i32, Ext);
5359 return DAG.getMergeValues({Ext, Chain}, DL);
5362 // Generate SUBS and CSEL for integer abs.
5363 SDValue AArch64TargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5364 MVT VT = Op.getSimpleValueType();
5365 SDLoc DL(Op);
5366 if (VT.isVector())
5367 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABS_MERGE_PASSTHRU);
5370 SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
5371 Op.getOperand(0));
5372 // Generate SUBS & CSEL.
5373 SDValue Cmp =
5374 DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
5375 Op.getOperand(0), DAG.getConstant(0, DL, VT));
5376 return DAG.getNode(AArch64ISD::CSEL, DL, VT, Op.getOperand(0), Neg,
5377 DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
5378 Cmp.getValue(1));
5379 }
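// Only custom lower BRCOND when the condition can be emitted as a conjunction
// of compares (a CCMP chain); otherwise fall back to the generic expansion.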
5381 static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
5382 SDValue Chain = Op.getOperand(0);
5383 SDValue Cond = Op.getOperand(1);
5384 SDValue Dest = Op.getOperand(2);
5386 AArch64CC::CondCode CC;
5387 if (SDValue Cmp = emitConjunction(DAG, Cond, CC)) {
5388 SDLoc dl(Op);
5389 SDValue CCVal = DAG.getConstant(CC, dl, MVT::i32);
5390 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
5391 Cmp);
5392 }
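// Central dispatch for every operation this target marks as Custom.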
5397 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
5398 SelectionDAG &DAG) const {
5399 LLVM_DEBUG(dbgs() << "Custom lowering: ");
5400 LLVM_DEBUG(Op.dump());
5402 switch (Op.getOpcode()) {
5403 default:
5404 llvm_unreachable("unimplemented operand");
5405 return SDValue();
5406 case ISD::BITCAST:
5407 return LowerBITCAST(Op, DAG);
5408 case ISD::GlobalAddress:
5409 return LowerGlobalAddress(Op, DAG);
5410 case ISD::GlobalTLSAddress:
5411 return LowerGlobalTLSAddress(Op, DAG);
5413 case ISD::STRICT_FSETCC:
5414 case ISD::STRICT_FSETCCS:
5415 return LowerSETCC(Op, DAG);
5417 return LowerBRCOND(Op, DAG);
5419 return LowerBR_CC(Op, DAG);
5421 return LowerSELECT(Op, DAG);
5422 case ISD::SELECT_CC:
5423 return LowerSELECT_CC(Op, DAG);
5424 case ISD::JumpTable:
5425 return LowerJumpTable(Op, DAG);
5427 return LowerBR_JT(Op, DAG);
5428 case ISD::ConstantPool:
5429 return LowerConstantPool(Op, DAG);
5430 case ISD::BlockAddress:
5431 return LowerBlockAddress(Op, DAG);
5433 return LowerVASTART(Op, DAG);
5435 return LowerVACOPY(Op, DAG);
5437 return LowerVAARG(Op, DAG);
5439 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, false /*unsigned*/);
5441 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, false /*unsigned*/);
5442 case ISD::SADDO_CARRY:
5443 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::ADCS, true /*signed*/);
5444 case ISD::SSUBO_CARRY:
5445 return lowerADDSUBCARRY(Op, DAG, AArch64ISD::SBCS, true /*signed*/);
5452 return LowerXALUO(Op, DAG);
5454 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
5456 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSUB_PRED);
5458 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMUL_PRED);
5460 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
5462 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FDIV_PRED);
5464 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEG_MERGE_PASSTHRU);
5466 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FCEIL_MERGE_PASSTHRU);
5468 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FFLOOR_MERGE_PASSTHRU);
5469 case ISD::FNEARBYINT:
5470 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FNEARBYINT_MERGE_PASSTHRU);
5472 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FRINT_MERGE_PASSTHRU);
5474 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUND_MERGE_PASSTHRU);
5475 case ISD::FROUNDEVEN:
5476 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FROUNDEVEN_MERGE_PASSTHRU);
5478 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FTRUNC_MERGE_PASSTHRU);
5480 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FSQRT_MERGE_PASSTHRU);
5482 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FABS_MERGE_PASSTHRU);
5484 case ISD::STRICT_FP_ROUND:
5485 return LowerFP_ROUND(Op, DAG);
5486 case ISD::FP_EXTEND:
5487 return LowerFP_EXTEND(Op, DAG);
5488 case ISD::FRAMEADDR:
5489 return LowerFRAMEADDR(Op, DAG);
5490 case ISD::SPONENTRY:
5491 return LowerSPONENTRY(Op, DAG);
5492 case ISD::RETURNADDR:
5493 return LowerRETURNADDR(Op, DAG);
5494 case ISD::ADDROFRETURNADDR:
5495 return LowerADDROFRETURNADDR(Op, DAG);
5496 case ISD::CONCAT_VECTORS:
5497 return LowerCONCAT_VECTORS(Op, DAG);
5498 case ISD::INSERT_VECTOR_ELT:
5499 return LowerINSERT_VECTOR_ELT(Op, DAG);
5500 case ISD::EXTRACT_VECTOR_ELT:
5501 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
5502 case ISD::BUILD_VECTOR:
5503 return LowerBUILD_VECTOR(Op, DAG);
5504 case ISD::VECTOR_SHUFFLE:
5505 return LowerVECTOR_SHUFFLE(Op, DAG);
5506 case ISD::SPLAT_VECTOR:
5507 return LowerSPLAT_VECTOR(Op, DAG);
5508 case ISD::EXTRACT_SUBVECTOR:
5509 return LowerEXTRACT_SUBVECTOR(Op, DAG);
5510 case ISD::INSERT_SUBVECTOR:
5511 return LowerINSERT_SUBVECTOR(Op, DAG);
5514 return LowerDIV(Op, DAG);
5519 return LowerMinMax(Op, DAG);
5523 return LowerVectorSRA_SRL_SHL(Op, DAG);
5524 case ISD::SHL_PARTS:
5525 case ISD::SRL_PARTS:
5526 case ISD::SRA_PARTS:
5527 return LowerShiftParts(Op, DAG);
5530 return LowerCTPOP_PARITY(Op, DAG);
5531 case ISD::FCOPYSIGN:
5532 return LowerFCOPYSIGN(Op, DAG);
5534 return LowerVectorOR(Op, DAG);
5536 return LowerXOR(Op, DAG);
5538 return LowerPREFETCH(Op, DAG);
5539 case ISD::SINT_TO_FP:
5540 case ISD::UINT_TO_FP:
5541 case ISD::STRICT_SINT_TO_FP:
5542 case ISD::STRICT_UINT_TO_FP:
5543 return LowerINT_TO_FP(Op, DAG);
5544 case ISD::FP_TO_SINT:
5545 case ISD::FP_TO_UINT:
5546 case ISD::STRICT_FP_TO_SINT:
5547 case ISD::STRICT_FP_TO_UINT:
5548 return LowerFP_TO_INT(Op, DAG);
5549 case ISD::FP_TO_SINT_SAT:
5550 case ISD::FP_TO_UINT_SAT:
5551 return LowerFP_TO_INT_SAT(Op, DAG);
5553 return LowerFSINCOS(Op, DAG);
5554 case ISD::FLT_ROUNDS_:
5555 return LowerFLT_ROUNDS_(Op, DAG);
5556 case ISD::SET_ROUNDING:
5557 return LowerSET_ROUNDING(Op, DAG);
5559 return LowerMUL(Op, DAG);
5561 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHS_PRED);
5563 return LowerToPredicatedOp(Op, DAG, AArch64ISD::MULHU_PRED);
5564 case ISD::INTRINSIC_W_CHAIN:
5565 return LowerINTRINSIC_W_CHAIN(Op, DAG);
5566 case ISD::INTRINSIC_WO_CHAIN:
5567 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5568 case ISD::ATOMIC_STORE:
5569 if (cast<MemSDNode>(Op)->getMemoryVT() == MVT::i128) {
5570 assert(Subtarget->hasLSE2());
5571 return LowerStore128(Op, DAG);
5572 }
5573 return SDValue();
5574 case ISD::STORE:
5575 return LowerSTORE(Op, DAG);
5577 return LowerFixedLengthVectorMStoreToSVE(Op, DAG);
5579 return LowerMGATHER(Op, DAG);
5581 return LowerMSCATTER(Op, DAG);
5582 case ISD::VECREDUCE_SEQ_FADD:
5583 return LowerVECREDUCE_SEQ_FADD(Op, DAG);
5584 case ISD::VECREDUCE_ADD:
5585 case ISD::VECREDUCE_AND:
5586 case ISD::VECREDUCE_OR:
5587 case ISD::VECREDUCE_XOR:
5588 case ISD::VECREDUCE_SMAX:
5589 case ISD::VECREDUCE_SMIN:
5590 case ISD::VECREDUCE_UMAX:
5591 case ISD::VECREDUCE_UMIN:
5592 case ISD::VECREDUCE_FADD:
5593 case ISD::VECREDUCE_FMAX:
5594 case ISD::VECREDUCE_FMIN:
5595 return LowerVECREDUCE(Op, DAG);
5596 case ISD::ATOMIC_LOAD_SUB:
5597 return LowerATOMIC_LOAD_SUB(Op, DAG);
5598 case ISD::ATOMIC_LOAD_AND:
5599 return LowerATOMIC_LOAD_AND(Op, DAG);
5600 case ISD::DYNAMIC_STACKALLOC:
5601 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5603 return LowerVSCALE(Op, DAG);
5604 case ISD::ANY_EXTEND:
5605 case ISD::SIGN_EXTEND:
5606 case ISD::ZERO_EXTEND:
5607 return LowerFixedLengthVectorIntExtendToSVE(Op, DAG);
5608 case ISD::SIGN_EXTEND_INREG: {
5609 // Only custom lower when ExtraVT has a legal byte based element type.
5610 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5611 EVT ExtraEltVT = ExtraVT.getVectorElementType();
5612 if ((ExtraEltVT != MVT::i8) && (ExtraEltVT != MVT::i16) &&
5613 (ExtraEltVT != MVT::i32) && (ExtraEltVT != MVT::i64))
5614 return SDValue();
5616 return LowerToPredicatedOp(Op, DAG,
5617 AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU);
5620 return LowerTRUNCATE(Op, DAG);
5622 return LowerMLOAD(Op, DAG);
5624 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
5625 return LowerFixedLengthVectorLoadToSVE(Op, DAG);
5626 return LowerLOAD(Op, DAG);
5630 return LowerToScalableOp(Op, DAG);
5632 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAX_PRED);
5634 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMAXNM_PRED);
5636 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMIN_PRED);
5638 return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMINNM_PRED);
5640 return LowerFixedLengthVectorSelectToSVE(Op, DAG);
5642 return LowerABS(Op, DAG);
5644 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDS_PRED);
5646 return LowerToPredicatedOp(Op, DAG, AArch64ISD::ABDU_PRED);
5647 case ISD::BITREVERSE:
5648 return LowerBitreverse(Op, DAG);
5650 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BSWAP_MERGE_PASSTHRU);
5652 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTLZ_MERGE_PASSTHRU);
5654 return LowerCTTZ(Op, DAG);
5655 case ISD::VECTOR_SPLICE:
5656 return LowerVECTOR_SPLICE(Op, DAG);
5657 case ISD::STRICT_LROUND:
5658 case ISD::STRICT_LLROUND:
5659 case ISD::STRICT_LRINT:
5660 case ISD::STRICT_LLRINT: {
5661 assert(Op.getOperand(1).getValueType() == MVT::f16 &&
5662 "Expected custom lowering of rounding operations only for f16");
5663 SDLoc DL(Op);
5664 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
5665 {Op.getOperand(0), Op.getOperand(1)});
5666 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
5667 {Ext.getValue(1), Ext.getValue(0)});
5668 }
5669 }
5672 bool AArch64TargetLowering::mergeStoresAfterLegalization(EVT VT) const {
5673 return !Subtarget->useSVEForFixedLengthVectors();
5674 }
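// Decide whether a fixed-length vector type should be lowered with SVE
// rather than NEON, based on element type, vector size and the subtarget's
// minimum SVE register width.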
5676 bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(
5677 EVT VT, bool OverrideNEON) const {
5678 if (!VT.isFixedLengthVector() || !VT.isSimple())
5681 // Don't use SVE for vectors we cannot scalarize if required.
5682 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
5683 // Fixed length predicates should be promoted to i8.
5684 // NOTE: This is consistent with how NEON (and thus 64/128bit vectors) work.
5698 // All SVE implementations support NEON sized vectors.
5699 if (OverrideNEON && (VT.is128BitVector() || VT.is64BitVector()))
5700 return Subtarget->hasSVE();
5702 // Ensure NEON MVTs only belong to a single register class.
5703 if (VT.getFixedSizeInBits() <= 128)
5706 // Ensure wider than NEON code generation is enabled.
5707 if (!Subtarget->useSVEForFixedLengthVectors())
5710 // Don't use SVE for types that don't fit.
5711 if (VT.getFixedSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
5714 // TODO: Perhaps an artificial restriction, but worth having whilst getting
5715 // the base fixed length SVE support in place.
5716 if (!VT.isPow2VectorType())
5722 //===----------------------------------------------------------------------===//
5723 // Calling Convention Implementation
5724 //===----------------------------------------------------------------------===//
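/// getIntrinsicID - Return the intrinsic ID carried by an INTRINSIC_WO_CHAIN
/// node, or Intrinsic::not_intrinsic otherwise.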
5726 static unsigned getIntrinsicID(const SDNode *N) {
5727 unsigned Opcode = N->getOpcode();
5728 switch (Opcode) {
5729 default:
5730 return Intrinsic::not_intrinsic;
5731 case ISD::INTRINSIC_WO_CHAIN: {
5732 unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
5733 if (IID < Intrinsic::num_intrinsics)
5734 return IID;
5735 return Intrinsic::not_intrinsic;
5740 bool AArch64TargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
5741 SDValue N1) const {
5742 if (!N0.hasOneUse())
5743 return false;
5745 unsigned IID = getIntrinsicID(N1.getNode());
5746 // Avoid reassociating expressions that can be lowered to smlal/umlal.
5747 if (IID == Intrinsic::aarch64_neon_umull ||
5748 N1.getOpcode() == AArch64ISD::UMULL ||
5749 IID == Intrinsic::aarch64_neon_smull ||
5750 N1.getOpcode() == AArch64ISD::SMULL)
5751 return N0.getOpcode() != ISD::ADD;
5753 return true;
5754 }
5756 /// Selects the correct CCAssignFn for a given CallingConvention value.
5757 CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
5758 bool IsVarArg) const {
5759 switch (CC) {
5760 default:
5761 report_fatal_error("Unsupported calling convention.");
5762 case CallingConv::WebKit_JS:
5763 return CC_AArch64_WebKit_JS;
5764 case CallingConv::GHC:
5765 return CC_AArch64_GHC;
5766 case CallingConv::C:
5767 case CallingConv::Fast:
5768 case CallingConv::PreserveMost:
5769 case CallingConv::CXX_FAST_TLS:
5770 case CallingConv::Swift:
5771 case CallingConv::SwiftTail:
5772 case CallingConv::Tail:
5773 if (Subtarget->isTargetWindows() && IsVarArg)
5774 return CC_AArch64_Win64_VarArg;
5775 if (!Subtarget->isTargetDarwin())
5776 return CC_AArch64_AAPCS;
5777 if (!IsVarArg)
5778 return CC_AArch64_DarwinPCS;
5779 return Subtarget->isTargetILP32() ? CC_AArch64_DarwinPCS_ILP32_VarArg
5780 : CC_AArch64_DarwinPCS_VarArg;
5781 case CallingConv::Win64:
5782 return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
5783 case CallingConv::CFGuard_Check:
5784 return CC_AArch64_Win64_CFGuard_Check;
5785 case CallingConv::AArch64_VectorCall:
5786 case CallingConv::AArch64_SVE_VectorCall:
5787 return CC_AArch64_AAPCS;
5788 }
5789 }
5791 CCAssignFn *
5792 AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
5793 return CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
5794 : RetCC_AArch64_AAPCS;
5797 SDValue AArch64TargetLowering::LowerFormalArguments(
5798 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
5799 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
5800 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
5801 MachineFunction &MF = DAG.getMachineFunction();
5802 const Function &F = MF.getFunction();
5803 MachineFrameInfo &MFI = MF.getFrameInfo();
5804 bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv());
5805 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
5807 SmallVector<ISD::OutputArg, 4> Outs;
5808 GetReturnInfo(CallConv, F.getReturnType(), F.getAttributes(), Outs,
5809 DAG.getTargetLoweringInfo(), MF.getDataLayout());
5810 if (any_of(Outs, [](ISD::OutputArg &Out){ return Out.VT.isScalableVector(); }))
5811 FuncInfo->setIsSVECC(true);
5813 // Assign locations to all of the incoming arguments.
5814 SmallVector<CCValAssign, 16> ArgLocs;
5815 DenseMap<unsigned, SDValue> CopiedRegs;
5816 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
5818 // At this point, Ins[].VT may already be promoted to i32. To correctly
5819 // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
5820 // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
5821 // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
5822 // we use a special version of AnalyzeFormalArguments to pass in ValVT and
5824 unsigned NumArgs = Ins.size();
5825 Function::const_arg_iterator CurOrigArg = F.arg_begin();
5826 unsigned CurArgIdx = 0;
5827 for (unsigned i = 0; i != NumArgs; ++i) {
5828 MVT ValVT = Ins[i].VT;
5829 if (Ins[i].isOrigArg()) {
5830 std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
5831 CurArgIdx = Ins[i].getOrigArgIndex();
5833 // Get type of the original argument.
5834 EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
5835 /*AllowUnknown*/ true);
5836 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
5837 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
5838 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
5839 ValVT = MVT::i8;
5840 else if (ActualMVT == MVT::i16)
5841 ValVT = MVT::i16;
5843 bool UseVarArgCC = false;
5844 if (IsWin64)
5845 UseVarArgCC = isVarArg;
5846 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, UseVarArgCC);
5847 bool Res =
5848 AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
5849 assert(!Res && "Call operand has unhandled type");
5852 SmallVector<SDValue, 16> ArgValues;
5853 unsigned ExtraArgLocs = 0;
5854 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
5855 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
5857 if (Ins[i].Flags.isByVal()) {
5858 // Byval is used for HFAs in the PCS, but the system should work in a
5859 // non-compliant manner for larger structs.
5860 EVT PtrVT = getPointerTy(DAG.getDataLayout());
5861 int Size = Ins[i].Flags.getByValSize();
5862 unsigned NumRegs = (Size + 7) / 8;
5864 // FIXME: This works on big-endian for composite byvals, which are the common
5865 // case. It should also work for fundamental types too.
5867 MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
5868 SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
5869 InVals.push_back(FrameIdxN);
5871 continue;
5872 }
5874 if (Ins[i].Flags.isSwiftAsync())
5875 MF.getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5877 SDValue ArgValue;
5878 if (VA.isRegLoc()) {
5879 // Arguments stored in registers.
5880 EVT RegVT = VA.getLocVT();
5881 const TargetRegisterClass *RC;
5883 if (RegVT == MVT::i32)
5884 RC = &AArch64::GPR32RegClass;
5885 else if (RegVT == MVT::i64)
5886 RC = &AArch64::GPR64RegClass;
5887 else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
5888 RC = &AArch64::FPR16RegClass;
5889 else if (RegVT == MVT::f32)
5890 RC = &AArch64::FPR32RegClass;
5891 else if (RegVT == MVT::f64 || RegVT.is64BitVector())
5892 RC = &AArch64::FPR64RegClass;
5893 else if (RegVT == MVT::f128 || RegVT.is128BitVector())
5894 RC = &AArch64::FPR128RegClass;
5895 else if (RegVT.isScalableVector() &&
5896 RegVT.getVectorElementType() == MVT::i1) {
5897 FuncInfo->setIsSVECC(true);
5898 RC = &AArch64::PPRRegClass;
5899 } else if (RegVT.isScalableVector()) {
5900 FuncInfo->setIsSVECC(true);
5901 RC = &AArch64::ZPRRegClass;
5903 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
5905 // Transform the arguments in physical registers into virtual ones.
5906 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
5907 ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
5909 // If this is an 8, 16 or 32-bit value, it is really passed promoted
5910 // to 64 bits. Insert an assert[sz]ext to capture this, then
5911 // truncate to the right size.
5912 switch (VA.getLocInfo()) {
5914 llvm_unreachable("Unknown loc info!");
5915 case CCValAssign::Full:
5917 case CCValAssign::Indirect:
5918 assert(VA.getValVT().isScalableVector() &&
5919 "Only scalable vectors can be passed indirectly");
5921 case CCValAssign::BCvt:
5922 ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
5924 case CCValAssign::AExt:
5925 case CCValAssign::SExt:
5926 case CCValAssign::ZExt:
5928 case CCValAssign::AExtUpper:
5929 ArgValue = DAG.getNode(ISD::SRL, DL, RegVT, ArgValue,
5930 DAG.getConstant(32, DL, RegVT));
5931 ArgValue = DAG.getZExtOrTrunc(ArgValue, DL, VA.getValVT());
5934 } else { // VA.isRegLoc()
5935 assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
5936 unsigned ArgOffset = VA.getLocMemOffset();
5937 unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
5938 ? VA.getLocVT().getSizeInBits()
5939 : VA.getValVT().getSizeInBits()) / 8;
5941 uint32_t BEAlign = 0;
5942 if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
5943 !Ins[i].Flags.isInConsecutiveRegs())
5944 BEAlign = 8 - ArgSize;
5946 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
5948 // Create load nodes to retrieve arguments from the stack.
5949 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
5951 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
5952 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
5953 MVT MemVT = VA.getValVT();
5955 switch (VA.getLocInfo()) {
5958 case CCValAssign::Trunc:
5959 case CCValAssign::BCvt:
5960 MemVT = VA.getLocVT();
5962 case CCValAssign::Indirect:
5963 assert(VA.getValVT().isScalableVector() &&
5964 "Only scalable vectors can be passed indirectly");
5965 MemVT = VA.getLocVT();
5967 case CCValAssign::SExt:
5968 ExtType = ISD::SEXTLOAD;
5970 case CCValAssign::ZExt:
5971 ExtType = ISD::ZEXTLOAD;
5972 break;
5973 case CCValAssign::AExt:
5974 ExtType = ISD::EXTLOAD;
5975 break;
5976 }
5978 ArgValue =
5979 DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN,
5980 MachinePointerInfo::getFixedStack(MF, FI), MemVT);
5983 if (VA.getLocInfo() == CCValAssign::Indirect) {
5984 assert(VA.getValVT().isScalableVector() &&
5985 "Only scalable vectors can be passed indirectly");
5987 uint64_t PartSize = VA.getValVT().getStoreSize().getKnownMinSize();
5988 unsigned NumParts = 1;
5989 if (Ins[i].Flags.isInConsecutiveRegs()) {
5990 assert(!Ins[i].Flags.isInConsecutiveRegsLast());
5991 while (!Ins[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
5992 ++NumParts;
5993 }
5995 MVT PartLoad = VA.getValVT();
5996 SDValue Ptr = ArgValue;
5998 // Ensure we generate all loads for each tuple part, whilst updating the
5999 // pointer after each load correctly using vscale.
6000 while (NumParts > 0) {
6001 ArgValue = DAG.getLoad(PartLoad, DL, Chain, Ptr, MachinePointerInfo());
6002 InVals.push_back(ArgValue);
6003 NumParts--;
6004 if (NumParts > 0) {
6005 SDValue BytesIncrement = DAG.getVScale(
6006 DL, Ptr.getValueType(),
6007 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
6008 SDNodeFlags Flags;
6009 Flags.setNoUnsignedWrap(true);
6010 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6011 BytesIncrement, Flags);
6017 if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
6018 ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
6019 ArgValue, DAG.getValueType(MVT::i32));
6021 // i1 arguments are zero-extended to i8 by the caller. Emit a
6022 // hint to reflect this.
6023 if (Ins[i].isOrigArg()) {
6024 Argument *OrigArg = F.getArg(Ins[i].getOrigArgIndex());
6025 if (OrigArg->getType()->isIntegerTy(1)) {
6026 if (!Ins[i].Flags.isZExt()) {
6027 ArgValue = DAG.getNode(AArch64ISD::ASSERT_ZEXT_BOOL, DL,
6028 ArgValue.getValueType(), ArgValue);
6033 InVals.push_back(ArgValue);
6036 assert((ArgLocs.size() + ExtraArgLocs) == Ins.size());
6040 if (!Subtarget->isTargetDarwin() || IsWin64) {
6041 // The AAPCS variadic function ABI is identical to the non-variadic
6042 // one. As a result there may be more arguments in registers and we should
6043 // save them for future reference.
6044 // Win64 variadic functions also pass arguments in registers, but all float
6045 // arguments are passed in integer registers.
6046 saveVarArgRegisters(CCInfo, DAG, DL, Chain);
6049 // This will point to the next argument passed via stack.
6050 unsigned StackOffset = CCInfo.getNextStackOffset();
6051 // We currently pass all varargs at 8-byte alignment, or 4 for ILP32
6052 StackOffset = alignTo(StackOffset, Subtarget->isTargetILP32() ? 4 : 8);
6053 FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
6055 if (MFI.hasMustTailInVarArgFunc()) {
6056 SmallVector<MVT, 2> RegParmTypes;
6057 RegParmTypes.push_back(MVT::i64);
6058 RegParmTypes.push_back(MVT::f128);
6059 // Compute the set of forwarded registers. The rest are scratch.
6060 SmallVectorImpl<ForwardedRegister> &Forwards =
6061 FuncInfo->getForwardedMustTailRegParms();
6062 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes,
6063 CC_AArch64_AAPCS);
6065 // Conservatively forward X8, since it might be used for aggregate return.
6066 if (!CCInfo.isAllocated(AArch64::X8)) {
6067 Register X8VReg = MF.addLiveIn(AArch64::X8, &AArch64::GPR64RegClass);
6068 Forwards.push_back(ForwardedRegister(X8VReg, AArch64::X8, MVT::i64));
6073 // On Windows, InReg pointers must be returned, so record the pointer in a
6074 // virtual register at the start of the function so it can be returned in the
6075 // epilogue.
6076 if (IsWin64) {
6077 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
6078 if (Ins[I].Flags.isInReg()) {
6079 assert(!FuncInfo->getSRetReturnReg());
6081 MVT PtrTy = getPointerTy(DAG.getDataLayout());
6082 Register Reg =
6083 MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
6084 FuncInfo->setSRetReturnReg(Reg);
6086 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[I]);
6087 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
6093 unsigned StackArgSize = CCInfo.getNextStackOffset();
6094 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6095 if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
6096 // This is a non-standard ABI so by fiat I say we're allowed to make full
6097 // use of the stack area to be popped, which must be aligned to 16 bytes in
6098 // any case:
6099 StackArgSize = alignTo(StackArgSize, 16);
6101 // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
6102 // a multiple of 16.
6103 FuncInfo->setArgumentStackToRestore(StackArgSize);
6105 // This realignment carries over to the available bytes below. Our own
6106 // callers will guarantee the space is free by giving an aligned value to
6107 // CALLSEQ_START.
6108 }
6109 // Even if we're not expected to free up the space, it's useful to know how
6110 // much is there while considering tail calls (because we can reuse it).
6111 FuncInfo->setBytesInStackArgArea(StackArgSize);
6113 if (Subtarget->hasCustomCallingConv())
6114 Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
6116 return Chain;
6117 }
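/// Spill the remaining unallocated GPR (and, outside Win64, FPR) argument
/// registers to the stack so that va_arg can later find them.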
6119 void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
6122 SDValue &Chain) const {
6123 MachineFunction &MF = DAG.getMachineFunction();
6124 MachineFrameInfo &MFI = MF.getFrameInfo();
6125 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6126 auto PtrVT = getPointerTy(DAG.getDataLayout());
6127 bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv());
6129 SmallVector<SDValue, 8> MemOps;
6131 static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
6132 AArch64::X3, AArch64::X4, AArch64::X5,
6133 AArch64::X6, AArch64::X7 };
6134 static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
6135 unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);
6137 unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
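// For example (illustrative): a variadic function with two named integer
// arguments has FirstVariadicGPR == 2, so X2..X7 are saved and
// GPRSaveSize == 6 * 8 == 48.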
6139 if (GPRSaveSize != 0) {
6141 GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
6142 if (GPRSaveSize & 15)
6143 // The extra size here, if triggered, will always be 8.
6144 MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
6146 GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
6148 SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
6150 for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
6151 Register VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
6152 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
6154 DAG.getStore(Val.getValue(1), DL, Val, FIN,
6155 IsWin64 ? MachinePointerInfo::getFixedStack(
6156 MF, GPRIdx, (i - FirstVariadicGPR) * 8)
6157 : MachinePointerInfo::getStack(MF, i * 8));
6158 MemOps.push_back(Store);
6160 DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
6163 FuncInfo->setVarArgsGPRIndex(GPRIdx);
6164 FuncInfo->setVarArgsGPRSize(GPRSaveSize);
6166 if (Subtarget->hasFPARMv8() && !IsWin64) {
6167 static const MCPhysReg FPRArgRegs[] = {
6168 AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
6169 AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
6170 static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
6171 unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);
6173 unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
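// For example (illustrative): a variadic function with no named FP/SIMD
// arguments saves all eight Q registers, so FPRSaveSize == 8 * 16 == 128.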
6175 if (FPRSaveSize != 0) {
6176 FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
6178 SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
6180 for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
6181 Register VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
6182 SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
6184 SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
6185 MachinePointerInfo::getStack(MF, i * 16));
6186 MemOps.push_back(Store);
6187 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
6188 DAG.getConstant(16, DL, PtrVT));
6191 FuncInfo->setVarArgsFPRIndex(FPRIdx);
6192 FuncInfo->setVarArgsFPRSize(FPRSaveSize);
6195 if (!MemOps.empty()) {
6196 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
6200 /// LowerCallResult - Lower the result values of a call into the
6201 /// appropriate copies out of appropriate physical registers.
6202 SDValue AArch64TargetLowering::LowerCallResult(
6203 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
6204 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
6205 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
6206 SDValue ThisVal) const {
6207 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
6208 // Assign locations to each value returned by this call.
6209 SmallVector<CCValAssign, 16> RVLocs;
6210 DenseMap<unsigned, SDValue> CopiedRegs;
6211 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
6213 CCInfo.AnalyzeCallResult(Ins, RetCC);
6215 // Copy all of the result registers out of their specified physreg.
6216 for (unsigned i = 0; i != RVLocs.size(); ++i) {
6217 CCValAssign VA = RVLocs[i];
6219 // Pass 'this' value directly from the argument to return value, to avoid
6220 // reg unit interference
6221 if (i == 0 && isThisReturn) {
6222 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
6223 "unexpected return calling convention register assignment");
6224 InVals.push_back(ThisVal);
6228 // Avoid copying a physreg twice since RegAllocFast is incompetent and only
6229 // allows one use of a physreg per block.
6230 SDValue Val = CopiedRegs.lookup(VA.getLocReg());
6233 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
6234 Chain = Val.getValue(1);
6235 InFlag = Val.getValue(2);
6236 CopiedRegs[VA.getLocReg()] = Val;
6239 switch (VA.getLocInfo()) {
6241 llvm_unreachable("Unknown loc info!");
6242 case CCValAssign::Full:
6244 case CCValAssign::BCvt:
6245 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
6247 case CCValAssign::AExtUpper:
6248 Val = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Val,
6249 DAG.getConstant(32, DL, VA.getLocVT()));
6251 case CCValAssign::AExt:
6253 case CCValAssign::ZExt:
6254 Val = DAG.getZExtOrTrunc(Val, DL, VA.getValVT());
6258 InVals.push_back(Val);
6264 /// Return true if the calling convention is one that we can guarantee TCO for.
6265 static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
6266 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
6267 CC == CallingConv::Tail || CC == CallingConv::SwiftTail;
6270 /// Return true if we might ever do TCO for calls with this calling convention.
6271 static bool mayTailCallThisCC(CallingConv::ID CC) {
6273 case CallingConv::C:
6274 case CallingConv::AArch64_SVE_VectorCall:
6275 case CallingConv::PreserveMost:
6276 case CallingConv::Swift:
6277 case CallingConv::SwiftTail:
6278 case CallingConv::Tail:
6279 case CallingConv::Fast:
6286 static void analyzeCallOperands(const AArch64TargetLowering &TLI,
6287 const AArch64Subtarget *Subtarget,
6288 const TargetLowering::CallLoweringInfo &CLI,
6290 const SelectionDAG &DAG = CLI.DAG;
6291 CallingConv::ID CalleeCC = CLI.CallConv;
6292 bool IsVarArg = CLI.IsVarArg;
6293 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6294 bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
6296 unsigned NumArgs = Outs.size();
6297 for (unsigned i = 0; i != NumArgs; ++i) {
6298 MVT ArgVT = Outs[i].VT;
6299 ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
6301 bool UseVarArgCC = false;
6303 // On Windows, the fixed arguments in a vararg call are passed in GPRs
6304 // too, so use the vararg CC to force them to integer registers.
6305 if (IsCalleeWin64) {
6308 UseVarArgCC = !Outs[i].IsFixed;
6311 // Get type of the original argument.
6313 TLI.getValueType(DAG.getDataLayout(), CLI.Args[Outs[i].OrigArgIndex].Ty,
6314 /*AllowUnknown*/ true);
6315 MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ArgVT;
6316 // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
6317 if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
6319 else if (ActualMVT == MVT::i16)
6323 CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CalleeCC, UseVarArgCC);
6324 bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
6325 assert(!Res && "Call operand has unhandled type");
6330 bool AArch64TargetLowering::isEligibleForTailCallOptimization(
6331 const CallLoweringInfo &CLI) const {
6332 CallingConv::ID CalleeCC = CLI.CallConv;
6333 if (!mayTailCallThisCC(CalleeCC))
6336 SDValue Callee = CLI.Callee;
6337 bool IsVarArg = CLI.IsVarArg;
6338 const SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6339 const SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6340 const SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6341 const SelectionDAG &DAG = CLI.DAG;
6342 MachineFunction &MF = DAG.getMachineFunction();
6343 const Function &CallerF = MF.getFunction();
6344 CallingConv::ID CallerCC = CallerF.getCallingConv();
6346 // Functions using the C or Fast calling convention that have an SVE signature
6347 // preserve more registers and should assume the SVE_VectorCall CC.
6348 // The check for matching callee-saved regs will determine whether it is
6349 // eligible for TCO.
6350 if ((CallerCC == CallingConv::C || CallerCC == CallingConv::Fast) &&
6351 MF.getInfo<AArch64FunctionInfo>()->isSVECC())
6352 CallerCC = CallingConv::AArch64_SVE_VectorCall;
6354 bool CCMatch = CallerCC == CalleeCC;
6356 // When using the Windows calling convention on a non-windows OS, we want
6357 // to back up and restore X18 in such functions; we can't do a tail call
6358 // from those functions.
6359 if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
6360 CalleeCC != CallingConv::Win64)
6363 // Byval parameters hand the function a pointer directly into the stack area
6364 // we want to reuse during a tail call. Working around this *is* possible (see
6365 // X86) but less efficient and uglier in LowerCall.
6366 for (Function::const_arg_iterator i = CallerF.arg_begin(),
6367 e = CallerF.arg_end();
6369 if (i->hasByValAttr())
6372 // On Windows, "inreg" attributes signify non-aggregate indirect returns.
6373 // In this case, it is necessary to save/restore X0 in the callee. Tail
6374 // call opt interferes with this. So we disable tail call opt when the
6375 // caller has an argument with "inreg" attribute.
6377 // FIXME: Check whether the callee also has an "inreg" argument.
6378 if (i->hasInRegAttr())
6382 if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
6385 // Externally-defined functions with weak linkage should not be
6386 // tail-called on AArch64 when the OS does not support dynamic
6387 // pre-emption of symbols, as the AAELF spec requires normal calls
6388 // to undefined weak functions to be replaced with a NOP or jump to the
6389 // next instruction. The behaviour of branch instructions in this
6390 // situation (as used for tail calls) is implementation-defined, so we
6391 // cannot rely on the linker replacing the tail call with a return.
6392 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6393 const GlobalValue *GV = G->getGlobal();
6394 const Triple &TT = getTargetMachine().getTargetTriple();
6395 if (GV->hasExternalWeakLinkage() &&
6396 (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
6400 // Now we search for cases where we can use a tail call without changing the
6401 // ABI. Sibcall is used in some places (particularly gcc) to refer to this
6402 // situation.
6404 // I want anyone implementing a new calling convention to think long and hard
6405 // about this assert.
6406 assert((!IsVarArg || CalleeCC == CallingConv::C) &&
6407 "Unexpected variadic calling convention");
6409 LLVMContext &C = *DAG.getContext();
6410 // Check that the call results are passed in the same way.
6411 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
6412 CCAssignFnForCall(CalleeCC, IsVarArg),
6413 CCAssignFnForCall(CallerCC, IsVarArg)))
6415 // The callee has to preserve all registers the caller needs to preserve.
6416 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6417 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
6419 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
6420 if (Subtarget->hasCustomCallingConv()) {
6421 TRI->UpdateCustomCallPreservedMask(MF, &CallerPreserved);
6422 TRI->UpdateCustomCallPreservedMask(MF, &CalleePreserved);
6424 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
6428 // Nothing more to check if the callee is taking no arguments
6432 SmallVector<CCValAssign, 16> ArgLocs;
6433 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, C);
6435 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6437 if (IsVarArg && !(CLI.CB && CLI.CB->isMustTailCall())) {
6438 // When the call is musttail, additional checks have been done and we can safely ignore this check.
6439 // At least two cases here: if caller is fastcc then we can't have any
6440 // memory arguments (we'd be expected to clean up the stack afterwards). If
6441 // caller is C then we could potentially use its argument area.
6443 // FIXME: for now we take the most conservative of these in both cases:
6444 // disallow all variadic memory operands.
6445 for (const CCValAssign &ArgLoc : ArgLocs)
6446 if (!ArgLoc.isRegLoc())
6450 const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6452 // If any of the arguments is passed indirectly, it must be SVE, so the
6453 // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
6454 // allocate space on the stack. That is why we explicitly determine here that
6455 // the call cannot be a tail call.
6456 if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
6457 assert((A.getLocInfo() != CCValAssign::Indirect ||
6458 A.getValVT().isScalableVector()) &&
6459 "Expected value to be scalable");
6460 return A.getLocInfo() == CCValAssign::Indirect;
6464 // If the stack arguments for this call do not fit into our own save area then
6465 // the call cannot be made tail.
6466 if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
6469 const MachineRegisterInfo &MRI = MF.getRegInfo();
6470 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
6476 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
6478 MachineFrameInfo &MFI,
6479 int ClobberedFI) const {
6480 SmallVector<SDValue, 8> ArgChains;
6481 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
6482 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
6484 // Include the original chain at the beginning of the list. When this is
6485 // used by target LowerCall hooks, this helps legalize find the
6486 // CALLSEQ_BEGIN node.
6487 ArgChains.push_back(Chain);
6489 // Add a chain value for each stack argument load that overlaps the clobbered object.
6490 for (SDNode *U : DAG.getEntryNode().getNode()->uses())
6491 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U))
6492 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
6493 if (FI->getIndex() < 0) {
6494 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
6495 int64_t InLastByte = InFirstByte;
6496 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
6498 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
6499 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
6500 ArgChains.push_back(SDValue(L, 1));
6503 // Build a tokenfactor for all the chains.
6504 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
6507 bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
6508 bool TailCallOpt) const {
6509 return (CallCC == CallingConv::Fast && TailCallOpt) ||
6510 CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
6513 // Check if the value is zero-extended from i1 to i8
6514 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
6515 unsigned SizeInBits = Arg.getValueType().getSizeInBits();
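// Note (editorial): the 0xFE mask below covers bits [7:1]; if those bits are
// known to be zero, the low byte is already 0 or 1, i.e. the i1 has been
// zero-extended to i8 as the AAPCS requires.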
6519 APInt RequiredZero(SizeInBits, 0xFE);
6520 KnownBits Bits = DAG.computeKnownBits(Arg, 4);
6521 bool ZExtBool = (Bits.Zero & RequiredZero) == RequiredZero;
6525 /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
6526 /// and add input and output parameter nodes.
6528 AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
6529 SmallVectorImpl<SDValue> &InVals) const {
6530 SelectionDAG &DAG = CLI.DAG;
6532 SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
6533 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
6534 SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
6535 SDValue Chain = CLI.Chain;
6536 SDValue Callee = CLI.Callee;
6537 bool &IsTailCall = CLI.IsTailCall;
6538 CallingConv::ID &CallConv = CLI.CallConv;
6539 bool IsVarArg = CLI.IsVarArg;
6541 MachineFunction &MF = DAG.getMachineFunction();
6542 MachineFunction::CallSiteInfo CSInfo;
6543 bool IsThisReturn = false;
6545 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
6546 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
6547 bool IsSibCall = false;
6548 bool GuardWithBTI = false;
6550 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
6551 !Subtarget->noBTIAtReturnTwice()) {
6552 GuardWithBTI = FuncInfo->branchTargetEnforcement();
6555 // Check callee args/returns for SVE registers and set calling convention
6556 // accordingly.
6557 if (CallConv == CallingConv::C || CallConv == CallingConv::Fast) {
6558 bool CalleeOutSVE = any_of(Outs, [](ISD::OutputArg &Out){
6559 return Out.VT.isScalableVector();
6561 bool CalleeInSVE = any_of(Ins, [](ISD::InputArg &In){
6562 return In.VT.isScalableVector();
6565 if (CalleeInSVE || CalleeOutSVE)
6566 CallConv = CallingConv::AArch64_SVE_VectorCall;
6570 // Check if it's really possible to do a tail call.
6571 IsTailCall = isEligibleForTailCallOptimization(CLI);
6573 // A sibling call is one where we're under the usual C ABI and not planning
6574 // to change that but can still do a tail call:
6575 if (!TailCallOpt && IsTailCall && CallConv != CallingConv::Tail &&
6576 CallConv != CallingConv::SwiftTail)
6583 if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
6584 report_fatal_error("failed to perform tail call elimination on a call "
6585 "site marked musttail");
6587 // Analyze operands of the call, assigning locations to each operand.
6588 SmallVector<CCValAssign, 16> ArgLocs;
6589 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
6592 unsigned NumArgs = Outs.size();
6594 for (unsigned i = 0; i != NumArgs; ++i) {
6595 if (!Outs[i].IsFixed && Outs[i].VT.isScalableVector())
6596 report_fatal_error("Passing SVE types to variadic functions is "
6597 "currently not supported");
6601 analyzeCallOperands(*this, Subtarget, CLI, CCInfo);
6603 // Get a count of how many bytes are to be pushed on the stack.
6604 unsigned NumBytes = CCInfo.getNextStackOffset();
6607 // Since we're not changing the ABI to make this a tail call, the memory
6608 // operands are already available in the caller's incoming argument space.
6612 // FPDiff is the byte offset of the call's argument area from the callee's.
6613 // Stores to callee stack arguments will be placed in FixedStackSlots offset
6614 // by this amount for a tail call. In a sibling call it must be 0 because the
6615 // caller will deallocate the entire stack and the callee still expects its
6616 // arguments to begin at SP+0. Completely unused for non-tail calls.
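// Worked example (illustrative, not from the original source): if the
// caller's incoming argument area is 32 bytes and this tail call needs 48
// bytes of outgoing stack arguments, FPDiff is -16 and 16 extra bytes must be
// reserved; with only 16 bytes of outgoing arguments, FPDiff is +16.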
6619 if (IsTailCall && !IsSibCall) {
6620 unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
6622 // Since callee will pop argument stack as a tail call, we must keep the
6623 // popped size 16-byte aligned.
6624 NumBytes = alignTo(NumBytes, 16);
6626 // FPDiff will be negative if this tail call requires more space than we
6627 // would automatically have in our incoming argument space. Positive if we
6628 // can actually shrink the stack.
6629 FPDiff = NumReusableBytes - NumBytes;
6631 // Update the required reserved area if this is the tail call requiring the
6632 // most argument stack space.
6633 if (FPDiff < 0 && FuncInfo->getTailCallReservedStack() < (unsigned)-FPDiff)
6634 FuncInfo->setTailCallReservedStack(-FPDiff);
6636 // The stack pointer must be 16-byte aligned at all times it's used for a
6637 // memory operation, which in practice means at *all* times and in
6638 // particular across call boundaries. Therefore our own arguments started at
6639 // a 16-byte aligned SP and the delta applied for the tail call should
6640 // satisfy the same constraint.
6641 assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
6644 // Adjust the stack pointer for the new arguments...
6645 // These operations are automatically eliminated by the prolog/epilog pass
6647 Chain = DAG.getCALLSEQ_START(Chain, IsTailCall ? 0 : NumBytes, 0, DL);
6649 SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
6650 getPointerTy(DAG.getDataLayout()));
6652 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
6653 SmallSet<unsigned, 8> RegsUsed;
6654 SmallVector<SDValue, 8> MemOpChains;
6655 auto PtrVT = getPointerTy(DAG.getDataLayout());
6657 if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
6658 const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
6659 for (const auto &F : Forwards) {
6660 SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
6661 RegsToPass.emplace_back(F.PReg, Val);
6665 // Walk the register/memloc assignments, inserting copies/loads.
6666 unsigned ExtraArgLocs = 0;
6667 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
6668 CCValAssign &VA = ArgLocs[i - ExtraArgLocs];
6669 SDValue Arg = OutVals[i];
6670 ISD::ArgFlagsTy Flags = Outs[i].Flags;
6672 // Promote the value if needed.
6673 switch (VA.getLocInfo()) {
6675 llvm_unreachable("Unknown loc info!");
6676 case CCValAssign::Full:
6678 case CCValAssign::SExt:
6679 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
6681 case CCValAssign::ZExt:
6682 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
6684 case CCValAssign::AExt:
6685 if (Outs[i].ArgVT == MVT::i1) {
6686 // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
6688 // Check if we actually have to do this, because the value may
6689 // already be zero-extended.
6691 // We cannot just emit a (zext i8 (trunc (assert-zext i8)))
6692 // and rely on DAGCombiner to fold this, because the following
6693 // (anyext i32) is combined with (zext i8) in DAG.getNode:
6695 // (ext (zext x)) -> (zext x)
6697 // This will give us (zext i32), which we cannot remove, so
6698 // try to check this beforehand.
6699 if (!checkZExtBool(Arg, DAG)) {
6700 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
6701 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
6704 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
6706 case CCValAssign::AExtUpper:
6707 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
6708 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
6709 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
6710 DAG.getConstant(32, DL, VA.getLocVT()));
6712 case CCValAssign::BCvt:
6713 Arg = DAG.getBitcast(VA.getLocVT(), Arg);
6715 case CCValAssign::Trunc:
6716 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
6718 case CCValAssign::FPExt:
6719 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
6721 case CCValAssign::Indirect:
6722 assert(VA.getValVT().isScalableVector() &&
6723 "Only scalable vectors can be passed indirectly");
6725 uint64_t StoreSize = VA.getValVT().getStoreSize().getKnownMinSize();
6726 uint64_t PartSize = StoreSize;
6727 unsigned NumParts = 1;
6728 if (Outs[i].Flags.isInConsecutiveRegs()) {
6729 assert(!Outs[i].Flags.isInConsecutiveRegsLast());
6730 while (!Outs[i + NumParts - 1].Flags.isInConsecutiveRegsLast())
6732 StoreSize *= NumParts;
6735 MachineFrameInfo &MFI = MF.getFrameInfo();
6736 Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
6737 Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
6738 int FI = MFI.CreateStackObject(StoreSize, Alignment, false);
6739 MFI.setStackID(FI, TargetStackID::ScalableVector);
6741 MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, FI);
6742 SDValue Ptr = DAG.getFrameIndex(
6743 FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
6744 SDValue SpillSlot = Ptr;
6746 // Ensure we generate all stores for each tuple part, whilst updating the
6747 // pointer after each store correctly using vscale.
6749 Chain = DAG.getStore(Chain, DL, OutVals[i], Ptr, MPI);
6752 SDValue BytesIncrement = DAG.getVScale(
6753 DL, Ptr.getValueType(),
6754 APInt(Ptr.getValueSizeInBits().getFixedSize(), PartSize));
6756 Flags.setNoUnsignedWrap(true);
6758 MPI = MachinePointerInfo(MPI.getAddrSpace());
6759 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6760 BytesIncrement, Flags);
6770 if (VA.isRegLoc()) {
6771 if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
6772 Outs[0].VT == MVT::i64) {
6773 assert(VA.getLocVT() == MVT::i64 &&
6774 "unexpected calling convention register assignment");
6775 assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
6776 "unexpected use of 'returned'");
6777 IsThisReturn = true;
6779 if (RegsUsed.count(VA.getLocReg())) {
6780 // If this register has already been used then we're trying to pack
6781 // parts of an [N x i32] into an X-register. The extension type will
6782 // take care of putting the two halves in the right place but we have to
6783 // combine them ourselves.
6785 llvm::find_if(RegsToPass,
6786 [=](const std::pair<unsigned, SDValue> &Elt) {
6787 return Elt.first == VA.getLocReg();
6790 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
6791 // Call site info is used for function's parameter entry value
6792 // tracking. For now we track only simple cases, where the parameter
6793 // is transferred through a whole register.
6794 llvm::erase_if(CSInfo, [&VA](MachineFunction::ArgRegPair ArgReg) {
6795 return ArgReg.Reg == VA.getLocReg();
6798 RegsToPass.emplace_back(VA.getLocReg(), Arg);
6799 RegsUsed.insert(VA.getLocReg());
6800 const TargetOptions &Options = DAG.getTarget().Options;
6801 if (Options.EmitCallSiteInfo)
6802 CSInfo.emplace_back(VA.getLocReg(), i);
6805 assert(VA.isMemLoc());
6808 MachinePointerInfo DstInfo;
6810 // FIXME: This works on big-endian for composite byvals, which are the
6811 // common case. It should also work for fundamental types.
6812 uint32_t BEAlign = 0;
6814 if (VA.getLocInfo() == CCValAssign::Indirect ||
6815 VA.getLocInfo() == CCValAssign::Trunc)
6816 OpSize = VA.getLocVT().getFixedSizeInBits();
6818 OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
6819 : VA.getValVT().getSizeInBits();
6820 OpSize = (OpSize + 7) / 8;
6821 if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
6822 !Flags.isInConsecutiveRegs()) {
6824 BEAlign = 8 - OpSize;
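// For example (illustrative): a 4-byte operand gives BEAlign == 4, which
// shifts the store into the higher-addressed half of the 8-byte slot.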
6826 unsigned LocMemOffset = VA.getLocMemOffset();
6827 int32_t Offset = LocMemOffset + BEAlign;
6828 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6829 PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6832 Offset = Offset + FPDiff;
6833 int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
6835 DstAddr = DAG.getFrameIndex(FI, PtrVT);
6836 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
6838 // Make sure any stack arguments overlapping with where we're storing
6839 // are loaded before this eventual operation. Otherwise they'll be
6840 // clobbered.
6841 Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
6843 SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
6845 DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
6846 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
6849 if (Outs[i].Flags.isByVal()) {
6851 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
6852 SDValue Cpy = DAG.getMemcpy(
6853 Chain, DL, DstAddr, Arg, SizeNode,
6854 Outs[i].Flags.getNonZeroByValAlign(),
6855 /*isVol = */ false, /*AlwaysInline = */ false,
6856 /*isTailCall = */ false, DstInfo, MachinePointerInfo());
6858 MemOpChains.push_back(Cpy);
6860 // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
6861 // promoted to a legal register type i32, we should truncate Arg back to
6862 // i1/i8/i16.
6863 if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 ||
6864 VA.getValVT() == MVT::i16)
6865 Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
6867 SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
6868 MemOpChains.push_back(Store);
6873 if (!MemOpChains.empty())
6874 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
6876 // Build a sequence of copy-to-reg nodes chained together with token chain
6877 // and flag operands which copy the outgoing args into the appropriate regs.
6879 for (auto &RegToPass : RegsToPass) {
6880 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
6881 RegToPass.second, InFlag);
6882 InFlag = Chain.getValue(1);
6885 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
6886 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
6887 // node so that legalize doesn't hack it.
6888 if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
6889 auto GV = G->getGlobal();
6891 Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
6892 if (OpFlags & AArch64II::MO_GOT) {
6893 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
6894 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6896 const GlobalValue *GV = G->getGlobal();
6897 Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
6899 } else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
6900 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
6901 Subtarget->isTargetMachO()) {
6902 const char *Sym = S->getSymbol();
6903 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
6904 Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
6906 const char *Sym = S->getSymbol();
6907 Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
6911 // We don't usually want to end the call-sequence here because we would tidy
6912 // the frame up *after* the call, however in the ABI-changing tail-call case
6913 // we've carefully laid out the parameters so that when sp is reset they'll be
6914 // in the correct location.
6915 if (IsTailCall && !IsSibCall) {
6916 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
6917 DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
6918 InFlag = Chain.getValue(1);
6921 std::vector<SDValue> Ops;
6922 Ops.push_back(Chain);
6923 Ops.push_back(Callee);
6926 // Each tail call may have to adjust the stack by a different amount, so
6927 // this information must travel along with the operation for eventual
6928 // consumption by emitEpilogue.
6929 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
6932 // Add argument registers to the end of the list so that they are known live
6933 // into the call.
6934 for (auto &RegToPass : RegsToPass)
6935 Ops.push_back(DAG.getRegister(RegToPass.first,
6936 RegToPass.second.getValueType()));
6938 // Add a register mask operand representing the call-preserved registers.
6939 const uint32_t *Mask;
6940 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
6942 // For 'this' returns, use the X0-preserving mask if applicable
6943 Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
6945 IsThisReturn = false;
6946 Mask = TRI->getCallPreservedMask(MF, CallConv);
6949 Mask = TRI->getCallPreservedMask(MF, CallConv);
6951 if (Subtarget->hasCustomCallingConv())
6952 TRI->UpdateCustomCallPreservedMask(MF, &Mask);
6954 if (TRI->isAnyArgRegReserved(MF))
6955 TRI->emitReservedArgRegCallError(MF);
6957 assert(Mask && "Missing call preserved mask for calling convention");
6958 Ops.push_back(DAG.getRegisterMask(Mask));
6960 if (InFlag.getNode())
6961 Ops.push_back(InFlag);
6963 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
6965 // If we're doing a tail call, use a TC_RETURN here rather than an
6966 // actual call instruction.
6968 MF.getFrameInfo().setHasTailCall();
6969 SDValue Ret = DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
6970 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
6974 unsigned CallOpc = AArch64ISD::CALL;
6975 // Calls with operand bundle "clang.arc.attachedcall" are special. They should
6976 // be expanded to the call, directly followed by a special marker sequence and
6977 // a call to an ObjC library function. Use CALL_RVMARKER to do that.
6978 if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) {
6979 assert(!IsTailCall &&
6980 "tail calls cannot be marked with clang.arc.attachedcall");
6981 CallOpc = AArch64ISD::CALL_RVMARKER;
6983 // Add a target global address for the retainRV/claimRV runtime function
6984 // just before the call target.
6985 Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB);
6986 auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT);
6987 Ops.insert(Ops.begin() + 1, GA);
6988 } else if (GuardWithBTI)
6989 CallOpc = AArch64ISD::CALL_BTI;
6991 // Returns a chain and a flag for retval copy to use.
6992 Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops);
6993 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
6994 InFlag = Chain.getValue(1);
6995 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
6997 uint64_t CalleePopBytes =
6998 DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;
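// For example (illustrative): a fastcc callee under GuaranteedTailCallOpt
// with 20 bytes of stack arguments pops alignTo(20, 16) == 32 bytes.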
7000 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
7001 DAG.getIntPtrConstant(CalleePopBytes, DL, true),
7004 InFlag = Chain.getValue(1);
7006 // Handle result values, copying them out of physregs into vregs that we
7007 // return.
7008 return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
7009 InVals, IsThisReturn,
7010 IsThisReturn ? OutVals[0] : SDValue());
7013 bool AArch64TargetLowering::CanLowerReturn(
7014 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
7015 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
7016 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7017 SmallVector<CCValAssign, 16> RVLocs;
7018 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
7019 return CCInfo.CheckReturn(Outs, RetCC);
7023 AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
7025 const SmallVectorImpl<ISD::OutputArg> &Outs,
7026 const SmallVectorImpl<SDValue> &OutVals,
7027 const SDLoc &DL, SelectionDAG &DAG) const {
7028 auto &MF = DAG.getMachineFunction();
7029 auto *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
7031 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv);
7032 SmallVector<CCValAssign, 16> RVLocs;
7033 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
7034 CCInfo.AnalyzeReturn(Outs, RetCC);
7036 // Copy the result values into the output registers.
7038 SmallVector<std::pair<unsigned, SDValue>, 4> RetVals;
7039 SmallSet<unsigned, 4> RegsUsed;
7040 for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
7041 ++i, ++realRVLocIdx) {
7042 CCValAssign &VA = RVLocs[i];
7043 assert(VA.isRegLoc() && "Can only return in registers!");
7044 SDValue Arg = OutVals[realRVLocIdx];
7046 switch (VA.getLocInfo()) {
7048 llvm_unreachable("Unknown loc info!");
7049 case CCValAssign::Full:
7050 if (Outs[i].ArgVT == MVT::i1) {
7051 // AAPCS requires i1 to be zero-extended to i8 by the producer of the
7052 // value. This is strictly redundant on Darwin (which uses "zeroext
7053 // i1"), but will be optimised out before ISel.
7054 Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
7055 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
7058 case CCValAssign::BCvt:
7059 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
7061 case CCValAssign::AExt:
7062 case CCValAssign::ZExt:
7063 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7065 case CCValAssign::AExtUpper:
7066 assert(VA.getValVT() == MVT::i32 && "only expect 32 -> 64 upper bits");
7067 Arg = DAG.getZExtOrTrunc(Arg, DL, VA.getLocVT());
7068 Arg = DAG.getNode(ISD::SHL, DL, VA.getLocVT(), Arg,
7069 DAG.getConstant(32, DL, VA.getLocVT()));
7073 if (RegsUsed.count(VA.getLocReg())) {
7075 llvm::find_if(RetVals, [=](const std::pair<unsigned, SDValue> &Elt) {
7076 return Elt.first == VA.getLocReg();
7078 Bits = DAG.getNode(ISD::OR, DL, Bits.getValueType(), Bits, Arg);
7080 RetVals.emplace_back(VA.getLocReg(), Arg);
7081 RegsUsed.insert(VA.getLocReg());
7085 SmallVector<SDValue, 4> RetOps(1, Chain);
7086 for (auto &RetVal : RetVals) {
7087 Chain = DAG.getCopyToReg(Chain, DL, RetVal.first, RetVal.second, Flag);
7088 Flag = Chain.getValue(1);
7090 DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
7093 // Windows AArch64 ABIs require that for returning structs by value we copy
7094 // the sret argument into X0 for the return.
7095 // We saved the argument into a virtual register in the entry block,
7096 // so now we copy the value out and into X0.
7097 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
7098 SDValue Val = DAG.getCopyFromReg(RetOps[0], DL, SRetReg,
7099 getPointerTy(MF.getDataLayout()));
7101 unsigned RetValReg = AArch64::X0;
7102 Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Flag);
7103 Flag = Chain.getValue(1);
7106 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
7109 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7110 const MCPhysReg *I = TRI->getCalleeSavedRegsViaCopy(&MF);
7113 if (AArch64::GPR64RegClass.contains(*I))
7114 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
7115 else if (AArch64::FPR64RegClass.contains(*I))
7116 RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
7118 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
7122 RetOps[0] = Chain; // Update chain.
7124 // Add the flag if we have it.
7126 RetOps.push_back(Flag);
7128 return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
7131 //===----------------------------------------------------------------------===//
7132 // Other Lowering Code
7133 //===----------------------------------------------------------------------===//
7135 SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
7137 unsigned Flag) const {
7138 return DAG.getTargetGlobalAddress(N->getGlobal(), SDLoc(N), Ty,
7139 N->getOffset(), Flag);
7142 SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
7144 unsigned Flag) const {
7145 return DAG.getTargetJumpTable(N->getIndex(), Ty, Flag);
7148 SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
7150 unsigned Flag) const {
7151 return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
7152 N->getOffset(), Flag);
7155 SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
7157 unsigned Flag) const {
7158 return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
7162 template <class NodeTy>
7163 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
7164 unsigned Flags) const {
7165 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
7167 EVT Ty = getPointerTy(DAG.getDataLayout());
7168 SDValue GotAddr = getTargetNode(N, Ty, DAG, AArch64II::MO_GOT | Flags);
7169 // FIXME: Once remat is capable of dealing with instructions with register
7170 // operands, expand this into two nodes instead of using a wrapper node.
7171 return DAG.getNode(AArch64ISD::LOADgot, DL, Ty, GotAddr);
7174 // (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
7175 template <class NodeTy>
7176 SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG,
7177 unsigned Flags) const {
7178 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
7180 EVT Ty = getPointerTy(DAG.getDataLayout());
7181 const unsigned char MO_NC = AArch64II::MO_NC;
7183 AArch64ISD::WrapperLarge, DL, Ty,
7184 getTargetNode(N, Ty, DAG, AArch64II::MO_G3 | Flags),
7185 getTargetNode(N, Ty, DAG, AArch64II::MO_G2 | MO_NC | Flags),
7186 getTargetNode(N, Ty, DAG, AArch64II::MO_G1 | MO_NC | Flags),
7187 getTargetNode(N, Ty, DAG, AArch64II::MO_G0 | MO_NC | Flags));
7190 // (addlow (adrp %hi(sym)) %lo(sym))
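// Roughly, for a symbol `sym` this produces (illustrative):
//   adrp xN, sym
//   add  xN, xN, :lo12:sym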
7191 template <class NodeTy>
7192 SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG,
7193 unsigned Flags) const {
7194 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
7196 EVT Ty = getPointerTy(DAG.getDataLayout());
7197 SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE | Flags);
7198 SDValue Lo = getTargetNode(N, Ty, DAG,
7199 AArch64II::MO_PAGEOFF | AArch64II::MO_NC | Flags);
7200 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
7201 return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
7205 template <class NodeTy>
7206 SDValue AArch64TargetLowering::getAddrTiny(NodeTy *N, SelectionDAG &DAG,
7207 unsigned Flags) const {
7208 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::getAddrTiny\n");
7210 EVT Ty = getPointerTy(DAG.getDataLayout());
7211 SDValue Sym = getTargetNode(N, Ty, DAG, Flags);
7212 return DAG.getNode(AArch64ISD::ADR, DL, Ty, Sym);
7215 SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
7216 SelectionDAG &DAG) const {
7217 GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
7218 const GlobalValue *GV = GN->getGlobal();
7219 unsigned OpFlags = Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
7221 if (OpFlags != AArch64II::MO_NO_FLAG)
7222 assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
7223 "unexpected offset in global node");
7225 // This also catches the large code model case for Darwin, and tiny code
7226 // model with got relocations.
7227 if ((OpFlags & AArch64II::MO_GOT) != 0) {
7228 return getGOT(GN, DAG, OpFlags);
7232 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
7233 Result = getAddrLarge(GN, DAG, OpFlags);
7234 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
7235 Result = getAddrTiny(GN, DAG, OpFlags);
7237 Result = getAddr(GN, DAG, OpFlags);
7239 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7241 if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
7242 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
7243 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
7247 /// Convert a TLS address reference into the correct sequence of loads
7248 /// and calls to compute the variable's address (for Darwin, currently) and
7249 /// return an SDValue containing the final node.
7251 /// Darwin only has one TLS scheme which must be capable of dealing with the
7252 /// fully general situation, in the worst case. This means:
7253 /// + "extern __thread" declaration.
7254 /// + Defined in a possibly unknown dynamic library.
7256 /// The general system is that each __thread variable has a [3 x i64] descriptor
7257 /// which contains information used by the runtime to calculate the address. The
7258 /// only part of this the compiler needs to know about is the first xword, which
7259 /// contains a function pointer that must be called with the address of the
7260 /// entire descriptor in "x0".
7262 /// Since this descriptor may be in a different unit, in general even the
7263 /// descriptor must be accessed via an indirect load. The "ideal" code sequence
7265 /// adrp x0, _var@TLVPPAGE
7266 /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
7267 /// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
7268 /// ; the function pointer
7269 /// blr x1 ; Uses descriptor address in x0
7270 /// ; Address of _var is now in x0.
7272 /// If the address of _var's descriptor *is* known to the linker, then it can
7273 /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
7274 /// a slight efficiency gain.
7276 AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
7277 SelectionDAG &DAG) const {
7278 assert(Subtarget->isTargetDarwin() &&
7279 "This function expects a Darwin target");
7282 MVT PtrVT = getPointerTy(DAG.getDataLayout());
7283 MVT PtrMemVT = getPointerMemTy(DAG.getDataLayout());
7284 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
7287 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7288 SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
7290 // The first entry in the descriptor is a function pointer that we must call
7291 // to obtain the address of the variable.
7292 SDValue Chain = DAG.getEntryNode();
7293 SDValue FuncTLVGet = DAG.getLoad(
7294 PtrMemVT, DL, Chain, DescAddr,
7295 MachinePointerInfo::getGOT(DAG.getMachineFunction()),
7296 Align(PtrMemVT.getSizeInBits() / 8),
7297 MachineMemOperand::MOInvariant | MachineMemOperand::MODereferenceable);
7298 Chain = FuncTLVGet.getValue(1);
7300 // Extend loaded pointer if necessary (i.e. if ILP32) to DAG pointer.
7301 FuncTLVGet = DAG.getZExtOrTrunc(FuncTLVGet, DL, PtrVT);
7303 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
7304 MFI.setAdjustsStack(true);
7306 // TLS calls preserve all registers except those that absolutely must be
7307 // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
7308 // silly).
7309 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
7310 const uint32_t *Mask = TRI->getTLSCallPreservedMask();
7311 if (Subtarget->hasCustomCallingConv())
7312 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
7314 // Finally, we can make the call. This is just a degenerate version of a
7315 // normal AArch64 call node: x0 takes the address of the descriptor, and
7316 // returns the address of the variable in this thread.
7317 Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
7319 DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
7320 Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
7321 DAG.getRegisterMask(Mask), Chain.getValue(1));
7322 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
7325 /// Convert a thread-local variable reference into a sequence of instructions to
7326 /// compute the variable's address for the local exec TLS model of ELF targets.
7327 /// The sequence depends on the maximum TLS area size.
7328 SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
7331 SelectionDAG &DAG) const {
7332 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7333 SDValue TPOff, Addr;
7335 switch (DAG.getTarget().Options.TLSSize) {
7337 llvm_unreachable("Unexpected TLS size");
7340 // mrs x0, TPIDR_EL0
7341 // add x0, x0, :tprel_lo12:a
7342 SDValue Var = DAG.getTargetGlobalAddress(
7343 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
7344 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7346 DAG.getTargetConstant(0, DL, MVT::i32)),
7351 // mrs x0, TPIDR_EL0
7352 // add x0, x0, :tprel_hi12:a
7353 // add x0, x0, :tprel_lo12_nc:a
7354 SDValue HiVar = DAG.getTargetGlobalAddress(
7355 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7356 SDValue LoVar = DAG.getTargetGlobalAddress(
7358 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7359 Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
7361 DAG.getTargetConstant(0, DL, MVT::i32)),
7363 return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
7365 DAG.getTargetConstant(0, DL, MVT::i32)),
7370 // mrs x1, TPIDR_EL0
7371 // movz x0, #:tprel_g1:a
7372 // movk x0, #:tprel_g0_nc:a
7374 SDValue HiVar = DAG.getTargetGlobalAddress(
7375 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
7376 SDValue LoVar = DAG.getTargetGlobalAddress(
7378 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7379 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7380 DAG.getTargetConstant(16, DL, MVT::i32)),
7382 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7383 DAG.getTargetConstant(0, DL, MVT::i32)),
7385 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7389 // mrs x1, TPIDR_EL0
7390 // movz x0, #:tprel_g2:a
7391 // movk x0, #:tprel_g1_nc:a
7392 // movk x0, #:tprel_g0_nc:a
7394 SDValue HiVar = DAG.getTargetGlobalAddress(
7395 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
7396 SDValue MiVar = DAG.getTargetGlobalAddress(
7398 AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
7399 SDValue LoVar = DAG.getTargetGlobalAddress(
7401 AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
7402 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
7403 DAG.getTargetConstant(32, DL, MVT::i32)),
7405 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
7406 DAG.getTargetConstant(16, DL, MVT::i32)),
7408 TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
7409 DAG.getTargetConstant(0, DL, MVT::i32)),
7411 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7416 /// When accessing thread-local variables under either the general-dynamic or
7417 /// local-dynamic system, we make a "TLS-descriptor" call. The variable will
7418 /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
7419 /// is a function pointer to carry out the resolution.
7421 /// The sequence is:
7422 /// adrp x0, :tlsdesc:var
7423 /// ldr x1, [x0, #:tlsdesc_lo12:var]
7424 /// add x0, x0, #:tlsdesc_lo12:var
7425 /// .tlsdesccall var
7426 /// blr x1
7427 /// (TPIDR_EL0 offset now in x0)
7429 /// The above sequence must be produced unscheduled, to enable the linker to
7430 /// optimize/relax this sequence.
7431 /// Therefore, a pseudo-instruction (TLSDESC_CALLSEQ) is used to represent the
7432 /// above sequence, and expanded really late in the compilation flow, to ensure
7433 /// the sequence is produced as per above.
7434 SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
7436 SelectionDAG &DAG) const {
7437 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7439 SDValue Chain = DAG.getEntryNode();
7440 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
7443 DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, NodeTys, {Chain, SymAddr});
7444 SDValue Glue = Chain.getValue(1);
7446 return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
7450 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
7451 SelectionDAG &DAG) const {
7452 assert(Subtarget->isTargetELF() && "This function expects an ELF target");
7454 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7456 TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
7458 if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
7459 if (Model == TLSModel::LocalDynamic)
7460 Model = TLSModel::GeneralDynamic;
7463 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
7464 Model != TLSModel::LocalExec)
7465 report_fatal_error("ELF TLS only supported in small memory model or "
7466 "in local exec TLS model");
7467 // Different choices can be made for the maximum size of the TLS area for a
7468 // module. For the small address model, the default TLS size is 16MiB and the
7469 // maximum TLS size is 4GiB.
7470 // FIXME: add tiny and large code model support for TLS access models other
7471 // than local exec. We currently generate the same code as small for tiny,
7472 // which may be larger than needed.
7475 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7477 const GlobalValue *GV = GA->getGlobal();
7479 SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
7481 if (Model == TLSModel::LocalExec) {
7482 return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
7483 } else if (Model == TLSModel::InitialExec) {
7484 TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7485 TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
7486 } else if (Model == TLSModel::LocalDynamic) {
7487 // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
7488 // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
7489 // the beginning of the module's TLS region, followed by a DTPREL offset
7490 // calculation.
7492 // These accesses will need deduplicating if there's more than one.
7493 AArch64FunctionInfo *MFI =
7494 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
7495 MFI->incNumLocalDynamicTLSAccesses();
7497 // The call needs a relocation too for linker relaxation. It doesn't make
7498 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
7499 // the address.
7500 SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
7503 // Now we can calculate the offset from TPIDR_EL0 to this module's
7504 // thread-local area.
7505 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
7507 // Now use :dtprel_whatever: operations to calculate this variable's offset
7508 // in its thread-storage area.
7509 SDValue HiVar = DAG.getTargetGlobalAddress(
7510 GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7511 SDValue LoVar = DAG.getTargetGlobalAddress(
7512 GV, DL, MVT::i64, 0,
7513 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7515 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
7516 DAG.getTargetConstant(0, DL, MVT::i32)),
7518 TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
7519 DAG.getTargetConstant(0, DL, MVT::i32)),
7521 } else if (Model == TLSModel::GeneralDynamic) {
7522 // The call needs a relocation too for linker relaxation. It doesn't make
7523 // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
7524 // the address.
7526 DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
7528 // Finally we can make a call to calculate the offset from tpidr_el0.
7529 TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
7531 llvm_unreachable("Unsupported ELF TLS access model");
7533 return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
7537 AArch64TargetLowering::LowerWindowsGlobalTLSAddress(SDValue Op,
7538 SelectionDAG &DAG) const {
7539 assert(Subtarget->isTargetWindows() && "Windows specific TLS lowering");
7541 SDValue Chain = DAG.getEntryNode();
7542 EVT PtrVT = getPointerTy(DAG.getDataLayout());
7545 SDValue TEB = DAG.getRegister(AArch64::X18, MVT::i64);
7547 // Load the ThreadLocalStoragePointer from the TEB
7548 // A pointer to the TLS array is located at offset 0x58 from the TEB.
7550 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x58, DL));
7551 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
7552 Chain = TLSArray.getValue(1);
7554 // Load the TLS index from the C runtime;
7555 // This does the same as getAddr(), but without having a GlobalAddressSDNode.
7556 // This also does the same as LOADgot, but using a generic i32 load,
7557 // while LOADgot only loads i64.
7558 SDValue TLSIndexHi =
7559 DAG.getTargetExternalSymbol("_tls_index", PtrVT, AArch64II::MO_PAGE);
7560 SDValue TLSIndexLo = DAG.getTargetExternalSymbol(
7561 "_tls_index", PtrVT, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7562 SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, TLSIndexHi);
7564 DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, TLSIndexLo);
7565 TLSIndex = DAG.getLoad(MVT::i32, DL, Chain, TLSIndex, MachinePointerInfo());
7566 Chain = TLSIndex.getValue(1);
7568 // The pointer to the thread's TLS data area is at offset (TLS index * 8)
7569 // into the TLSArray.
7570 TLSIndex = DAG.getNode(ISD::ZERO_EXTEND, DL, PtrVT, TLSIndex);
7571 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
7572 DAG.getConstant(3, DL, PtrVT));
7573 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
7574 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
7575 MachinePointerInfo());
7576 Chain = TLS.getValue(1);
7578 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7579 const GlobalValue *GV = GA->getGlobal();
7580 SDValue TGAHi = DAG.getTargetGlobalAddress(
7581 GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
7582 SDValue TGALo = DAG.getTargetGlobalAddress(
7584 AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
7586 // Add the offset from the start of the .tls section (section base).
7588 SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TLS, TGAHi,
7589 DAG.getTargetConstant(0, DL, MVT::i32)),
7591 Addr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, Addr, TGALo);
7595 SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
7596 SelectionDAG &DAG) const {
7597 const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
7598 if (DAG.getTarget().useEmulatedTLS())
7599 return LowerToTLSEmulatedModel(GA, DAG);
7601 if (Subtarget->isTargetDarwin())
7602 return LowerDarwinGlobalTLSAddress(Op, DAG);
7603 if (Subtarget->isTargetELF())
7604 return LowerELFGlobalTLSAddress(Op, DAG);
7605 if (Subtarget->isTargetWindows())
7606 return LowerWindowsGlobalTLSAddress(Op, DAG);
7608 llvm_unreachable("Unexpected platform trying to use TLS");
7611 // Looks through \param Val to determine the bit that can be used to
7612 // check the sign of the value. It returns the unextended value and
7613 // the sign bit position.
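// Usage example (illustrative): for (sign_extend_inreg i64 %x, i8) this
// returns {%x, 7}; for a plain i32 value it returns {value, 31}.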
7614 std::pair<SDValue, uint64_t> lookThroughSignExtension(SDValue Val) {
7615 if (Val.getOpcode() == ISD::SIGN_EXTEND_INREG)
7616 return {Val.getOperand(0),
7617 cast<VTSDNode>(Val.getOperand(1))->getVT().getFixedSizeInBits() -
7620 if (Val.getOpcode() == ISD::SIGN_EXTEND)
7621 return {Val.getOperand(0),
7622 Val.getOperand(0)->getValueType(0).getFixedSizeInBits() - 1};
7624 return {Val, Val.getValueSizeInBits() - 1};
7627 SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
7628 SDValue Chain = Op.getOperand(0);
7629 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
7630 SDValue LHS = Op.getOperand(2);
7631 SDValue RHS = Op.getOperand(3);
7632 SDValue Dest = Op.getOperand(4);
7635 MachineFunction &MF = DAG.getMachineFunction();
7636 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
7637 // will not be produced, as they are conditional branch instructions that do
7638 // not set flags.
7639 bool ProduceNonFlagSettingCondBr =
7640 !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
7642 // Handle f128 first, since lowering it will result in comparing the return
7643 // value of a libcall against zero, which is just what the rest of LowerBR_CC
7644 // is expecting to deal with.
7645 if (LHS.getValueType() == MVT::f128) {
7646 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
7648 // If softenSetCCOperands returned a scalar, we need to compare the result
7649 // against zero to select between true and false values.
7650 if (!RHS.getNode()) {
7651 RHS = DAG.getConstant(0, dl, LHS.getValueType());
7656 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
7658 if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
7659 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
7660 // Only lower legal XALUO ops.
7661 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
7664 // The actual operation with overflow check.
7665 AArch64CC::CondCode OFCC;
7666 SDValue Value, Overflow;
7667 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
7669 if (CC == ISD::SETNE)
7670 OFCC = getInvertedCondCode(OFCC);
7671 SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);
7673 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7677 if (LHS.getValueType().isInteger()) {
7678 assert((LHS.getValueType() == RHS.getValueType()) &&
7679 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
7681 // If the RHS of the comparison is zero, we can potentially fold this
7682 // to a specialized branch.
7683 const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
7684 if (RHSC && RHSC->getZExtValue() == 0 && ProduceNonFlagSettingCondBr) {
7685 if (CC == ISD::SETEQ) {
7686 // See if we can use a TBZ to fold in an AND as well.
7687 // TBZ has a smaller branch displacement than CBZ. If the offset is
7688 // out of bounds, a late MI-layer pass rewrites branches.
7689 // 403.gcc is an example that hits this case.
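// For instance (a sketch), (brcond (seteq (and x, 4), 0), dest) can be emitted
// as a single "tbz x0, #2, dest" instead of a tst + b.eq pair.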
7690 if (LHS.getOpcode() == ISD::AND &&
7691 isa<ConstantSDNode>(LHS.getOperand(1)) &&
7692 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
7693 SDValue Test = LHS.getOperand(0);
7694 uint64_t Mask = LHS.getConstantOperandVal(1);
7695 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
7696 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
7700 return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
7701 } else if (CC == ISD::SETNE) {
7702 // See if we can use a TBZ to fold in an AND as well.
7703 // TBZ has a smaller branch displacement than CBZ. If the offset is
7704 // out of bounds, a late MI-layer pass rewrites branches.
7705 // 403.gcc is an example that hits this case.
7706 if (LHS.getOpcode() == ISD::AND &&
7707 isa<ConstantSDNode>(LHS.getOperand(1)) &&
7708 isPowerOf2_64(LHS.getConstantOperandVal(1))) {
7709 SDValue Test = LHS.getOperand(0);
7710 uint64_t Mask = LHS.getConstantOperandVal(1);
7711 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
7712 DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
7716 return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
7717 } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
7718 // Don't combine AND since emitComparison converts the AND to an ANDS
7719 // (a.k.a. TST) and the test in the test bit and branch instruction
7720 // becomes redundant. This would also increase register pressure.
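// For example, (brcond (setlt x, 0), dest) with an i64 x becomes roughly
// "tbnz x0, #63, dest", testing only the sign bit.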
7721 uint64_t SignBitPos;
7722 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
7723 return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
7724 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
7727 if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
7728 LHS.getOpcode() != ISD::AND && ProduceNonFlagSettingCondBr) {
7729 // Don't combine AND since emitComparison converts the AND to an ANDS
7730 // (a.k.a. TST) and the test in the test bit and branch instruction
7731 // becomes redundant. This would also increase register pressure.
7732 uint64_t SignBitPos;
7733 std::tie(LHS, SignBitPos) = lookThroughSignExtension(LHS);
7734 return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
7735 DAG.getConstant(SignBitPos, dl, MVT::i64), Dest);
7739 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
7740 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
7744 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
7745 LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
7747 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
7748 // clean. Some of them require two branches to implement.
7749 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
7750 AArch64CC::CondCode CC1, CC2;
7751 changeFPCCToAArch64CC(CC, CC1, CC2);
7752 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
7754 DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
7755 if (CC2 != AArch64CC::AL) {
7756 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
7757 return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
7764 SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
7765 SelectionDAG &DAG) const {
7766 if (!Subtarget->hasNEON())
7769 EVT VT = Op.getValueType();
7770 EVT IntVT = VT.changeTypeToInteger();
7773 SDValue In1 = Op.getOperand(0);
7774 SDValue In2 = Op.getOperand(1);
7775 EVT SrcVT = In2.getValueType();
7777 if (SrcVT.bitsLT(VT))
7778 In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
7779 else if (SrcVT.bitsGT(VT))
7780 In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2,
7781 DAG.getIntPtrConstant(0, DL, /*isTarget=*/true));
7783 if (VT.isScalableVector())
7785 getPackedSVEVectorVT(VT.getVectorElementType().changeTypeToInteger());
7787 if (VT != In2.getValueType())
7790 auto BitCast = [this](EVT VT, SDValue Op, SelectionDAG &DAG) {
7791 if (VT.isScalableVector())
7792 return getSVESafeBitCast(VT, Op, DAG);
7794 return DAG.getBitcast(VT, Op);
7797 SDValue VecVal1, VecVal2;
7799 auto SetVecVal = [&](int Idx = -1) {
7800 if (!VT.isVector()) {
7802 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In1);
7804 DAG.getTargetInsertSubreg(Idx, DL, VecVT, DAG.getUNDEF(VecVT), In2);
7806 VecVal1 = BitCast(VecVT, In1, DAG);
7807 VecVal2 = BitCast(VecVT, In2, DAG);
7810 if (VT.isVector()) {
7813 } else if (VT == MVT::f64) {
7815 SetVecVal(AArch64::dsub);
7816 } else if (VT == MVT::f32) {
7818 SetVecVal(AArch64::ssub);
7819 } else if (VT == MVT::f16) {
7821 SetVecVal(AArch64::hsub);
7823 llvm_unreachable("Invalid type for copysign!");
7826 unsigned BitWidth = In1.getScalarValueSizeInBits();
7827 SDValue SignMaskV = DAG.getConstant(~APInt::getSignMask(BitWidth), DL, VecVT);
7829 // We want to materialize a mask with every bit but the high bit set, but the
7830 // AdvSIMD immediate moves cannot materialize that in a single instruction for
7831 // 64-bit elements. Instead, materialize all bits set and then negate that.
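// Sketch of that trick for v2f64 (register names illustrative):
//   movi v0.2d, #0xffffffffffffffff
//   fneg v0.2d, v0.2d     // flips only the sign bit -> 0x7fff...ffff per lane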
7832 if (VT == MVT::f64 || VT == MVT::v2f64) {
7833 SignMaskV = DAG.getConstant(APInt::getAllOnes(BitWidth), DL, VecVT);
7834 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, SignMaskV);
7835 SignMaskV = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, SignMaskV);
7836 SignMaskV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, SignMaskV);
7840 DAG.getNode(AArch64ISD::BSP, DL, VecVT, SignMaskV, VecVal1, VecVal2);
7842 return DAG.getTargetExtractSubreg(AArch64::hsub, DL, VT, BSP);
7844 return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, BSP);
7846 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, BSP);
7848 return BitCast(VT, BSP, DAG);
7851 SDValue AArch64TargetLowering::LowerCTPOP_PARITY(SDValue Op,
7852 SelectionDAG &DAG) const {
7853 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
7854 Attribute::NoImplicitFloat))
7857 if (!Subtarget->hasNEON())
7860 bool IsParity = Op.getOpcode() == ISD::PARITY;
7862 // While there is no scalar integer popcount instruction, CTPOP can
7863 // be more efficiently lowered to the following sequence that uses
7864 // AdvSIMD registers/instructions as long as the copies to/from
7865 // the AdvSIMD registers are cheap.
7866 // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
7867 // CNT V0.8B, V0.8B // 8xbyte pop-counts
7868 // ADDV B0, V0.8B // sum 8xbyte pop-counts
7869 // UMOV X0, V0.B[0] // copy byte result back to integer reg
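// For ISD::PARITY the same sequence is used and the summed pop-count is then
// masked with 1 (e.g. "and w0, w0, #1"), matching the AND emitted below.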
7870 SDValue Val = Op.getOperand(0);
7872 EVT VT = Op.getValueType();
7874 if (VT == MVT::i32 || VT == MVT::i64) {
7876 Val = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Val);
7877 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
7879 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Val);
7880 SDValue UaddLV = DAG.getNode(
7881 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7882 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7885 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
7886 DAG.getConstant(1, DL, MVT::i32));
7889 UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
7891 } else if (VT == MVT::i128) {
7892 Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
7894 SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
7895 SDValue UaddLV = DAG.getNode(
7896 ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
7897 DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
7900 UaddLV = DAG.getNode(ISD::AND, DL, MVT::i32, UaddLV,
7901 DAG.getConstant(1, DL, MVT::i32));
7903 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
7906 assert(!IsParity && "ISD::PARITY of vector types not supported");
7908 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
7909 return LowerToPredicatedOp(Op, DAG, AArch64ISD::CTPOP_MERGE_PASSTHRU);
7911 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
7912 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
7913 "Unexpected type for custom ctpop lowering");
7915 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
7916 Val = DAG.getBitcast(VT8Bit, Val);
7917 Val = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Val);
7919 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
7920 unsigned EltSize = 8;
7921 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
7922 while (EltSize != VT.getScalarSizeInBits()) {
7925 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
7927 ISD::INTRINSIC_WO_CHAIN, DL, WidenVT,
7928 DAG.getConstant(Intrinsic::aarch64_neon_uaddlp, DL, MVT::i32), Val);
7934 SDValue AArch64TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) const {
7935 EVT VT = Op.getValueType();
7936 assert(VT.isScalableVector() ||
7937 useSVEForFixedLengthVectorVT(
7938 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()));
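// The lowering below relies on the identity cttz(x) == ctlz(bitreverse(x));
// e.g. for an SVE vector this becomes, roughly, a predicated RBIT followed by
// a predicated CLZ.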
7941 SDValue RBIT = DAG.getNode(ISD::BITREVERSE, DL, VT, Op.getOperand(0));
7942 return DAG.getNode(ISD::CTLZ, DL, VT, RBIT);
7945 SDValue AArch64TargetLowering::LowerMinMax(SDValue Op,
7946 SelectionDAG &DAG) const {
7948 EVT VT = Op.getValueType();
7950 unsigned Opcode = Op.getOpcode();
7954 llvm_unreachable("Wrong instruction");
7969 if (VT.isScalableVector() ||
7970 useSVEForFixedLengthVectorVT(
7971 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
7974 llvm_unreachable("Wrong instruction");
7976 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_PRED);
7978 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_PRED);
7980 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_PRED);
7982 return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_PRED);
7986 SDValue Op0 = Op.getOperand(0);
7987 SDValue Op1 = Op.getOperand(1);
7988 SDValue Cond = DAG.getSetCC(DL, VT, Op0, Op1, CC);
7989 return DAG.getSelect(DL, VT, Cond, Op0, Op1);
7992 SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
7993 SelectionDAG &DAG) const {
7994 EVT VT = Op.getValueType();
7996 if (VT.isScalableVector() ||
7997 useSVEForFixedLengthVectorVT(
7998 VT, /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors()))
7999 return LowerToPredicatedOp(Op, DAG, AArch64ISD::BITREVERSE_MERGE_PASSTHRU);
8005 switch (VT.getSimpleVT().SimpleTy) {
8007 llvm_unreachable("Invalid type for bitreverse!");
8011 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8018 REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
8025 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8032 REVB = DAG.getNode(AArch64ISD::REV64, DL, VST, Op.getOperand(0));
8038 return DAG.getNode(AArch64ISD::NVCAST, DL, VT,
8039 DAG.getNode(ISD::BITREVERSE, DL, VST, REVB));
8042 SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
8044 if (Op.getValueType().isVector())
8045 return LowerVSETCC(Op, DAG);
8047 bool IsStrict = Op->isStrictFPOpcode();
8048 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
8049 unsigned OpNo = IsStrict ? 1 : 0;
8052 Chain = Op.getOperand(0);
8053 SDValue LHS = Op.getOperand(OpNo + 0);
8054 SDValue RHS = Op.getOperand(OpNo + 1);
8055 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
8058 // We chose ZeroOrOneBooleanContents, so use zero and one.
8059 EVT VT = Op.getValueType();
8060 SDValue TVal = DAG.getConstant(1, dl, VT);
8061 SDValue FVal = DAG.getConstant(0, dl, VT);
8063 // Handle f128 first, since one possible outcome is a normal integer
8064 // comparison which gets picked up by the next if statement.
8065 if (LHS.getValueType() == MVT::f128) {
8066 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
8069 // If softenSetCCOperands returned a scalar, use it.
8070 if (!RHS.getNode()) {
8071 assert(LHS.getValueType() == Op.getValueType() &&
8072 "Unexpected setcc expansion!");
8073 return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
8077 if (LHS.getValueType().isInteger()) {
8079 SDValue Cmp = getAArch64Cmp(
8080 LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
8082 // Note that we inverted the condition above, so we reverse the order of
8083 // the true and false operands here. This will allow the setcc to be
8084 // matched to a single CSINC instruction.
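// For example, (i32 (setcc x, y, seteq)) becomes roughly "cmp w_x, w_y"
// followed by "cset w0, eq", where cset is the csinc-with-wzr alias (a sketch;
// register names illustrative).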
8085 SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
8086 return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
8089 // Now we know we're dealing with FP values.
8090 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8091 LHS.getValueType() == MVT::f64);
8093 // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
8094 // and do the comparison.
8097 Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
8099 Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8101 AArch64CC::CondCode CC1, CC2;
8102 changeFPCCToAArch64CC(CC, CC1, CC2);
8104 if (CC2 == AArch64CC::AL) {
8105 changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
8107 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8109 // Note that we inverted the condition above, so we reverse the order of
8110 // the true and false operands here. This will allow the setcc to be
8111 // matched to a single CSINC instruction.
8112 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
8114 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
8115 // totally clean. Some of them require two CSELs to implement. As is in
8116 // this case, we emit the first CSEL and then emit a second using the output
8117 // of the first as the RHS. We're effectively OR'ing the two CC's together.
8119 // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
8120 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8122 DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8124 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8125 Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8127 return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
8130 SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
8131 SDValue RHS, SDValue TVal,
8132 SDValue FVal, const SDLoc &dl,
8133 SelectionDAG &DAG) const {
8134 // Handle f128 first, because it will result in a comparison of some RTLIB
8135 // call result against zero.
8136 if (LHS.getValueType() == MVT::f128) {
8137 softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
8139 // If softenSetCCOperands returned a scalar, we need to compare the result
8140 // against zero to select between true and false values.
8141 if (!RHS.getNode()) {
8142 RHS = DAG.getConstant(0, dl, LHS.getValueType());
8147 // Also handle f16, for which we need to do a f32 comparison.
8148 if (LHS.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
8149 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
8150 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
8153 // Next, handle integers.
8154 if (LHS.getValueType().isInteger()) {
8155 assert((LHS.getValueType() == RHS.getValueType()) &&
8156 (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
8158 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
8159 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
8160 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
8161 // Check for sign pattern (SELECT_CC setgt, iN lhs, -1, 1, -1) and transform
8162 // into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
8163 // comparison.
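// For an i32 input this is roughly "asr w8, w0, #31" followed by
// "orr w0, w8, #1", producing 1 for non-negative inputs and -1 otherwise.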
8164 if (CC == ISD::SETGT && RHSC && RHSC->isAllOnes() && CTVal && CFVal &&
8165 CTVal->isOne() && CFVal->isAllOnes() &&
8166 LHS.getValueType() == TVal.getValueType()) {
8167 EVT VT = LHS.getValueType();
8169 DAG.getNode(ISD::SRA, dl, VT, LHS,
8170 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
8171 return DAG.getNode(ISD::OR, dl, VT, Shift, DAG.getConstant(1, dl, VT));
8174 unsigned Opcode = AArch64ISD::CSEL;
8176 // If both the TVal and the FVal are constants, see if we can swap them in
8177 // order to form a CSINV or CSINC out of them.
8178 if (CTVal && CFVal && CTVal->isAllOnes() && CFVal->isZero()) {
8179 std::swap(TVal, FVal);
8180 std::swap(CTVal, CFVal);
8181 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8182 } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isZero()) {
8183 std::swap(TVal, FVal);
8184 std::swap(CTVal, CFVal);
8185 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8186 } else if (TVal.getOpcode() == ISD::XOR) {
8187 // If TVal is a NOT we want to swap TVal and FVal so that we can match
8188 // with a CSINV rather than a CSEL.
8189 if (isAllOnesConstant(TVal.getOperand(1))) {
8190 std::swap(TVal, FVal);
8191 std::swap(CTVal, CFVal);
8192 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8194 } else if (TVal.getOpcode() == ISD::SUB) {
8195 // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
8196 // that we can match with a CSNEG rather than a CSEL.
8197 if (isNullConstant(TVal.getOperand(0))) {
8198 std::swap(TVal, FVal);
8199 std::swap(CTVal, CFVal);
8200 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8202 } else if (CTVal && CFVal) {
8203 const int64_t TrueVal = CTVal->getSExtValue();
8204 const int64_t FalseVal = CFVal->getSExtValue();
8207 // If both TVal and FVal are constants, see if FVal is the
8208 // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
8209 // instead of a CSEL in that case.
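// For example, selecting between 1 and -1 fits CSNEG (since 1 == -(-1)); as a
// sketch, with register choices illustrative:
//   mov   w8, #1
//   csneg w0, w8, w8, <cond>   // cond ? 1 : -1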
8210 if (TrueVal == ~FalseVal) {
8211 Opcode = AArch64ISD::CSINV;
8212 } else if (FalseVal > std::numeric_limits<int64_t>::min() &&
8213 TrueVal == -FalseVal) {
8214 Opcode = AArch64ISD::CSNEG;
8215 } else if (TVal.getValueType() == MVT::i32) {
8216 // If our operands are only 32-bit wide, make sure we use 32-bit
8217 // arithmetic for the check whether we can use CSINC. This ensures that
8218 // the addition in the check will wrap around properly in case there is
8219 // an overflow (which would not be the case if we do the check with
8220 // 64-bit arithmetic).
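// E.g. for i32 values 0x7fffffff and 0x80000000: in 32-bit arithmetic
// 0x7fffffff + 1 wraps to 0x80000000, so CSINC applies, whereas the
// sign-extended 64-bit values differ by 2^32 - 1 and a 64-bit check would
// miss the case.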
8221 const uint32_t TrueVal32 = CTVal->getZExtValue();
8222 const uint32_t FalseVal32 = CFVal->getZExtValue();
8224 if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
8225 Opcode = AArch64ISD::CSINC;
8227 if (TrueVal32 > FalseVal32) {
8231 // 64-bit check whether we can use CSINC.
8232 } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
8233 Opcode = AArch64ISD::CSINC;
8235 if (TrueVal > FalseVal) {
8240 // Swap TVal and FVal if necessary.
8242 std::swap(TVal, FVal);
8243 std::swap(CTVal, CFVal);
8244 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
8247 if (Opcode != AArch64ISD::CSEL) {
8248 // Drop FVal since we can get its value by simply inverting/negating
8249 // TVal.
8254 // Avoid materializing a constant when possible by reusing a known value in
8255 // a register. However, don't perform this optimization if the known value
8256 // is one, zero or negative one in the case of a CSEL. We can always
8257 // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
8258 // FVal, respectively.
8259 ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
8260 if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
8261 !RHSVal->isZero() && !RHSVal->isAllOnes()) {
8262 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8263 // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
8264 // "a != C ? x : a" to avoid materializing C.
8265 if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
8267 else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
8269 } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
8270 assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
8271 // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
8272 // avoid materializing C.
8273 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
8274 if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
8275 Opcode = AArch64ISD::CSINV;
8277 FVal = DAG.getConstant(0, dl, FVal.getValueType());
8282 SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
8283 EVT VT = TVal.getValueType();
8284 return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
8287 // Now we know we're dealing with FP values.
8288 assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
8289 LHS.getValueType() == MVT::f64);
8290 assert(LHS.getValueType() == RHS.getValueType());
8291 EVT VT = TVal.getValueType();
8292 SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
8294 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
8295 // clean. Some of them require two CSELs to implement.
8296 AArch64CC::CondCode CC1, CC2;
8297 changeFPCCToAArch64CC(CC, CC1, CC2);
8299 if (DAG.getTarget().Options.UnsafeFPMath) {
8300 // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
8301 // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
8302 ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
8303 if (RHSVal && RHSVal->isZero()) {
8304 ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
8305 ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);
8307 if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
8308 CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
8310 else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
8311 CFVal && CFVal->isZero() &&
8312 FVal.getValueType() == LHS.getValueType())
8317 // Emit first, and possibly only, CSEL.
8318 SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
8319 SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
8321 // If we need a second CSEL, emit it, using the output of the first as the
8322 // RHS. We're effectively OR'ing the two CC's together.
8323 if (CC2 != AArch64CC::AL) {
8324 SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
8325 return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
8328 // Otherwise, return the output of the first CSEL.
8332 SDValue AArch64TargetLowering::LowerVECTOR_SPLICE(SDValue Op,
8333 SelectionDAG &DAG) const {
8334 EVT Ty = Op.getValueType();
8335 auto Idx = Op.getConstantOperandAPInt(2);
8336 int64_t IdxVal = Idx.getSExtValue();
8337 assert(Ty.isScalableVector() &&
8338 "Only expect scalable vectors for custom lowering of VECTOR_SPLICE");
8340 // We can use the splice instruction for certain index values where we are
8341 // able to efficiently generate the correct predicate. The index will be
8342 // inverted and used directly as the input to the ptrue instruction, i.e.
8343 // -1 -> vl1, -2 -> vl2, etc. The predicate will then be reversed to get the
8344 // splice predicate. However, we can only do this if we can guarantee that
8345 // there are enough elements in the vector, hence we check the index <= min
8346 // number of elements.
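// Sketch for IdxVal == -2 on nxv2i64 (register names illustrative):
//   ptrue  p0.d, vl2
//   rev    p0.d, p0.d             // activate only the last two elements
//   splice z0.d, p0, z0.d, z1.d   // last two of op0, then leading op1 elements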
8347 Optional<unsigned> PredPattern;
8348 if (Ty.isScalableVector() && IdxVal < 0 &&
8349 (PredPattern = getSVEPredPatternFromNumElements(std::abs(IdxVal))) !=
8353 // Create a predicate where all but the last -IdxVal elements are false.
8354 EVT PredVT = Ty.changeVectorElementType(MVT::i1);
8355 SDValue Pred = getPTrue(DAG, DL, PredVT, *PredPattern);
8356 Pred = DAG.getNode(ISD::VECTOR_REVERSE, DL, PredVT, Pred);
8358 // Now splice the two inputs together using the predicate.
8359 return DAG.getNode(AArch64ISD::SPLICE, DL, Ty, Pred, Op.getOperand(0),
8363 // This will select to an EXT instruction, which has a maximum immediate
8364 // value of 255, hence 2048-bits is the maximum value we can lower.
8366 IdxVal < int64_t(2048 / Ty.getVectorElementType().getSizeInBits()))
8372 SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
8373 SelectionDAG &DAG) const {
8374 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
8375 SDValue LHS = Op.getOperand(0);
8376 SDValue RHS = Op.getOperand(1);
8377 SDValue TVal = Op.getOperand(2);
8378 SDValue FVal = Op.getOperand(3);
8380 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8383 SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
8384 SelectionDAG &DAG) const {
8385 SDValue CCVal = Op->getOperand(0);
8386 SDValue TVal = Op->getOperand(1);
8387 SDValue FVal = Op->getOperand(2);
8390 EVT Ty = Op.getValueType();
8391 if (Ty.isScalableVector()) {
8392 SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
8393 MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
8394 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
8395 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8398 if (useSVEForFixedLengthVectorVT(Ty)) {
8399 // FIXME: Ideally this would be the same as above using i1 types, however
8400 // for the moment we can't deal with fixed i1 vector types properly, so
8401 // instead extend the predicate to a result type sized integer vector.
8402 MVT SplatValVT = MVT::getIntegerVT(Ty.getScalarSizeInBits());
8403 MVT PredVT = MVT::getVectorVT(SplatValVT, Ty.getVectorElementCount());
8404 SDValue SplatVal = DAG.getSExtOrTrunc(CCVal, DL, SplatValVT);
8405 SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, SplatVal);
8406 return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
8409 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
8411 if (ISD::isOverflowIntrOpRes(CCVal)) {
8412 // Only lower legal XALUO ops.
8413 if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
8416 AArch64CC::CondCode OFCC;
8417 SDValue Value, Overflow;
8418 std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
8419 SDValue CCVal = DAG.getConstant(OFCC, DL, MVT::i32);
8421 return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
8425 // Lower it the same way as we would lower a SELECT_CC node.
8428 if (CCVal.getOpcode() == ISD::SETCC) {
8429 LHS = CCVal.getOperand(0);
8430 RHS = CCVal.getOperand(1);
8431 CC = cast<CondCodeSDNode>(CCVal.getOperand(2))->get();
8434 RHS = DAG.getConstant(0, DL, CCVal.getValueType());
8437 return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
8440 SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
8441 SelectionDAG &DAG) const {
8442 // Jump table entries are emitted as PC-relative offsets. No additional tweaking
8443 // is necessary here. Just get the address of the jump table.
8444 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
8446 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8447 !Subtarget->isTargetMachO()) {
8448 return getAddrLarge(JT, DAG);
8449 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8450 return getAddrTiny(JT, DAG);
8452 return getAddr(JT, DAG);
8455 SDValue AArch64TargetLowering::LowerBR_JT(SDValue Op,
8456 SelectionDAG &DAG) const {
8457 // Jump table entries are emitted as PC-relative offsets. No additional tweaking
8458 // is necessary here. Just get the address of the jump table.
8460 SDValue JT = Op.getOperand(1);
8461 SDValue Entry = Op.getOperand(2);
8462 int JTI = cast<JumpTableSDNode>(JT.getNode())->getIndex();
8464 auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
8465 AFI->setJumpTableEntryInfo(JTI, 4, nullptr);
8468 DAG.getMachineNode(AArch64::JumpTableDest32, DL, MVT::i64, MVT::i64, JT,
8469 Entry, DAG.getTargetJumpTable(JTI, MVT::i32));
8470 return DAG.getNode(ISD::BRIND, DL, MVT::Other, Op.getOperand(0),
8474 SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
8475 SelectionDAG &DAG) const {
8476 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
8478 if (getTargetMachine().getCodeModel() == CodeModel::Large) {
8479 // Use the GOT for the large code model on iOS.
8480 if (Subtarget->isTargetMachO()) {
8481 return getGOT(CP, DAG);
8483 return getAddrLarge(CP, DAG);
8484 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8485 return getAddrTiny(CP, DAG);
8487 return getAddr(CP, DAG);
8491 SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
8492 SelectionDAG &DAG) const {
8493 BlockAddressSDNode *BA = cast<BlockAddressSDNode>(Op);
8494 if (getTargetMachine().getCodeModel() == CodeModel::Large &&
8495 !Subtarget->isTargetMachO()) {
8496 return getAddrLarge(BA, DAG);
8497 } else if (getTargetMachine().getCodeModel() == CodeModel::Tiny) {
8498 return getAddrTiny(BA, DAG);
8500 return getAddr(BA, DAG);
8503 SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
8504 SelectionDAG &DAG) const {
8505 AArch64FunctionInfo *FuncInfo =
8506 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
8509 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(),
8510 getPointerTy(DAG.getDataLayout()));
8511 FR = DAG.getZExtOrTrunc(FR, DL, getPointerMemTy(DAG.getDataLayout()));
8512 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8513 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
8514 MachinePointerInfo(SV));
8517 SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
8518 SelectionDAG &DAG) const {
8519 AArch64FunctionInfo *FuncInfo =
8520 DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
8523 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0
8524 ? FuncInfo->getVarArgsGPRIndex()
8525 : FuncInfo->getVarArgsStackIndex(),
8526 getPointerTy(DAG.getDataLayout()));
8527 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8528 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
8529 MachinePointerInfo(SV));
8532 SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
8533 SelectionDAG &DAG) const {
8534 // The layout of the va_list struct is specified in the AArch64 Procedure Call
8535 // Standard, section B.3.
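// For reference, using the AAPCS64 field names, the va_list built below is
// effectively:
//   struct va_list {
//     void *__stack;   // offset 0
//     void *__gr_top;  // offset 8  (4 on ILP32)
//     void *__vr_top;  // offset 16 (8 on ILP32)
//     int   __gr_offs; // offset 24 (12 on ILP32)
//     int   __vr_offs; // offset 28 (16 on ILP32)
//   };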
8536 MachineFunction &MF = DAG.getMachineFunction();
8537 AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
8538 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
8539 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8540 auto PtrVT = getPointerTy(DAG.getDataLayout());
8543 SDValue Chain = Op.getOperand(0);
8544 SDValue VAList = Op.getOperand(1);
8545 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8546 SmallVector<SDValue, 4> MemOps;
8548 // void *__stack at offset 0
8549 unsigned Offset = 0;
8550 SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
8551 Stack = DAG.getZExtOrTrunc(Stack, DL, PtrMemVT);
8552 MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
8553 MachinePointerInfo(SV), Align(PtrSize)));
8555 // void *__gr_top at offset 8 (4 on ILP32)
8557 int GPRSize = FuncInfo->getVarArgsGPRSize();
8559 SDValue GRTop, GRTopAddr;
8561 GRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8562 DAG.getConstant(Offset, DL, PtrVT));
8564 GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
8565 GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
8566 DAG.getConstant(GPRSize, DL, PtrVT));
8567 GRTop = DAG.getZExtOrTrunc(GRTop, DL, PtrMemVT);
8569 MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
8570 MachinePointerInfo(SV, Offset),
8574 // void *__vr_top at offset 16 (8 on ILP32)
8576 int FPRSize = FuncInfo->getVarArgsFPRSize();
8578 SDValue VRTop, VRTopAddr;
8579 VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8580 DAG.getConstant(Offset, DL, PtrVT));
8582 VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
8583 VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
8584 DAG.getConstant(FPRSize, DL, PtrVT));
8585 VRTop = DAG.getZExtOrTrunc(VRTop, DL, PtrMemVT);
8587 MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
8588 MachinePointerInfo(SV, Offset),
8592 // int __gr_offs at offset 24 (12 on ILP32)
8594 SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8595 DAG.getConstant(Offset, DL, PtrVT));
8597 DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32),
8598 GROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
8600 // int __vr_offs at offset 28 (16 on ILP32)
8602 SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8603 DAG.getConstant(Offset, DL, PtrVT));
8605 DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32),
8606 VROffsAddr, MachinePointerInfo(SV, Offset), Align(4)));
8608 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
8611 SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
8612 SelectionDAG &DAG) const {
8613 MachineFunction &MF = DAG.getMachineFunction();
8615 if (Subtarget->isCallingConvWin64(MF.getFunction().getCallingConv()))
8616 return LowerWin64_VASTART(Op, DAG);
8617 else if (Subtarget->isTargetDarwin())
8618 return LowerDarwin_VASTART(Op, DAG);
8620 return LowerAAPCS_VASTART(Op, DAG);
8623 SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
8624 SelectionDAG &DAG) const {
8625 // AAPCS has three pointers and two ints (= 32 bytes); Darwin and Windows use
8626 // a single pointer, so the va_list is simply copied as a block of that size.
8628 unsigned PtrSize = Subtarget->isTargetILP32() ? 4 : 8;
8629 unsigned VaListSize =
8630 (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
8632 : Subtarget->isTargetILP32() ? 20 : 32;
8633 const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
8634 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
8636 return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
8637 DAG.getConstant(VaListSize, DL, MVT::i32),
8638 Align(PtrSize), false, false, false,
8639 MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
8642 SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
8643 assert(Subtarget->isTargetDarwin() &&
8644 "automatic va_arg instruction only works on Darwin");
8646 const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
8647 EVT VT = Op.getValueType();
8649 SDValue Chain = Op.getOperand(0);
8650 SDValue Addr = Op.getOperand(1);
8651 MaybeAlign Align(Op.getConstantOperandVal(3));
8652 unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
8653 auto PtrVT = getPointerTy(DAG.getDataLayout());
8654 auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
8656 DAG.getLoad(PtrMemVT, DL, Chain, Addr, MachinePointerInfo(V));
8657 Chain = VAList.getValue(1);
8658 VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
8660 if (VT.isScalableVector())
8661 report_fatal_error("Passing SVE types to variadic functions is "
8662 "currently not supported");
8664 if (Align && *Align > MinSlotSize) {
8665 VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8666 DAG.getConstant(Align->value() - 1, DL, PtrVT));
8667 VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
8668 DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
8671 Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
8672 unsigned ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
8674 // Scalar integer and FP values smaller than 64 bits are implicitly extended
8675 // up to 64 bits. At the very least, we have to increase the striding of the
8676 // vaargs list to match this, and for FP values we need to introduce
8677 // FP_ROUND nodes as well.
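// For example, a va_arg of type float reads the 8-byte slot as an f64 and then
// narrows it, roughly "ldr d0, [xN]" followed by "fcvt s0, d0" (a sketch).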
8678 if (VT.isInteger() && !VT.isVector())
8679 ArgSize = std::max(ArgSize, MinSlotSize);
8680 bool NeedFPTrunc = false;
8681 if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
8686 // Increment the pointer, VAList, to the next vaarg
8687 SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
8688 DAG.getConstant(ArgSize, DL, PtrVT));
8689 VANext = DAG.getZExtOrTrunc(VANext, DL, PtrMemVT);
8691 // Store the incremented VAList to the legalized pointer
8693 DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));
8695 // Load the actual argument out of the pointer VAList
8697 // Load the value as an f64.
8699 DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
8700 // Round the value down to an f32.
8702 DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
8703 DAG.getIntPtrConstant(1, DL, /*isTarget=*/true));
8704 SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
8705 // Merge the rounded value with the chain output of the load.
8706 return DAG.getMergeValues(Ops, DL);
8709 return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
8712 SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
8713 SelectionDAG &DAG) const {
8714 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8715 MFI.setFrameAddressIsTaken(true);
8717 EVT VT = Op.getValueType();
8719 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8721 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, MVT::i64);
8723 FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
8724 MachinePointerInfo());
8726 if (Subtarget->isTargetILP32())
8727 FrameAddr = DAG.getNode(ISD::AssertZext, DL, MVT::i64, FrameAddr,
8728 DAG.getValueType(VT));
8733 SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
8734 SelectionDAG &DAG) const {
8735 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
8737 EVT VT = getPointerTy(DAG.getDataLayout());
8739 int FI = MFI.CreateFixedObject(4, 0, false);
8740 return DAG.getFrameIndex(FI, VT);
8743 #define GET_REGISTER_MATCHER
8744 #include "AArch64GenAsmMatcher.inc"
8746 // FIXME? Maybe this could be a TableGen attribute on some registers and
8747 // this table could be generated automatically from RegInfo.
8748 Register AArch64TargetLowering::
8749 getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
8750 Register Reg = MatchRegisterName(RegName);
8751 if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
8752 const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
8753 unsigned DwarfRegNum = MRI->getDwarfRegNum(Reg, false);
8754 if (!Subtarget->isXRegisterReserved(DwarfRegNum))
8759 report_fatal_error(Twine("Invalid register name \""
8760 + StringRef(RegName) + "\"."));
8763 SDValue AArch64TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
8764 SelectionDAG &DAG) const {
8765 DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);
8767 EVT VT = Op.getValueType();
8771 DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
8772 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
8774 return DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset);
8777 SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
8778 SelectionDAG &DAG) const {
8779 MachineFunction &MF = DAG.getMachineFunction();
8780 MachineFrameInfo &MFI = MF.getFrameInfo();
8781 MFI.setReturnAddressIsTaken(true);
8783 EVT VT = Op.getValueType();
8785 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
8786 SDValue ReturnAddress;
8788 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
8789 SDValue Offset = DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
8790 ReturnAddress = DAG.getLoad(
8791 VT, DL, DAG.getEntryNode(),
8792 DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), MachinePointerInfo());
8794 // Return LR, which contains the return address. Mark it an implicit
8795 // live-in.
8796 Register Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
8797 ReturnAddress = DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8800 // The XPACLRI instruction assembles to a hint-space instruction before
8801 // Armv8.3-A, so it can safely be used on any pre-Armv8.3-A architecture. On
8802 // Armv8.3-A and onwards, XPACI is available, so use that instead.
8805 if (Subtarget->hasPAuth()) {
8806 St = DAG.getMachineNode(AArch64::XPACI, DL, VT, ReturnAddress);
8808 // XPACLRI operates on LR therefore we must move the operand accordingly.
8810 DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::LR, ReturnAddress);
8811 St = DAG.getMachineNode(AArch64::XPACLRI, DL, VT, Chain);
8813 return SDValue(St, 0);
8816 /// LowerShiftParts - Lower SHL_PARTS/SRA_PARTS/SRL_PARTS, which return two
8817 /// i64 values and take a 2 x i64 value to shift plus a shift amount.
8818 SDValue AArch64TargetLowering::LowerShiftParts(SDValue Op,
8819 SelectionDAG &DAG) const {
8821 expandShiftParts(Op.getNode(), Lo, Hi, DAG);
8822 return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
8825 bool AArch64TargetLowering::isOffsetFoldingLegal(
8826 const GlobalAddressSDNode *GA) const {
8827 // Offsets are folded in the DAG combine rather than here so that we can
8828 // intelligently choose an offset based on the uses.
8832 bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
8833 bool OptForSize) const {
8834 bool IsLegal = false;
8835 // We can materialize #0.0 as fmov $Rd, XZR for the 64-bit and 32-bit cases,
8836 // and for the 16-bit case when the target has full fp16 support.
8837 // FIXME: We should be able to handle f128 as well with a clever lowering.
8838 const APInt ImmInt = Imm.bitcastToAPInt();
8840 IsLegal = AArch64_AM::getFP64Imm(ImmInt) != -1 || Imm.isPosZero();
8841 else if (VT == MVT::f32)
8842 IsLegal = AArch64_AM::getFP32Imm(ImmInt) != -1 || Imm.isPosZero();
8843 else if (VT == MVT::f16 && Subtarget->hasFullFP16())
8844 IsLegal = AArch64_AM::getFP16Imm(ImmInt) != -1 || Imm.isPosZero();
8845 // TODO: fmov h0, w0 is also legal, however we don't have an isel pattern to
8846 // generate that fmov.
8848 // If we cannot materialize the immediate for an fmov, check whether the
8849 // value can be encoded as the immediate operand of a logical instruction.
8850 // The immediate value will be created with either MOVZ, MOVN, or ORR.
8851 if (!IsLegal && (VT == MVT::f64 || VT == MVT::f32)) {
8852 // The cost is actually exactly the same for mov+fmov vs. adrp+ldr;
8853 // however the mov+fmov sequence is always better because of the reduced
8854 // cache pressure. The timings are still the same if you consider
8855 // movw+movk+fmov vs. adrp+ldr (it's one instruction longer, but the
8856 // movw+movk is fused). So we limit the expansion to at most 2 instructions.
8857 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
8858 AArch64_IMM::expandMOVImm(ImmInt.getZExtValue(), VT.getSizeInBits(),
8860 unsigned Limit = (OptForSize ? 1 : (Subtarget->hasFuseLiterals() ? 5 : 2));
8861 IsLegal = Insn.size() <= Limit;
8864 LLVM_DEBUG(dbgs() << (IsLegal ? "Legal " : "Illegal ") << VT.getEVTString()
8865 << " imm value: "; Imm.dump(););
8869 //===----------------------------------------------------------------------===//
8870 // AArch64 Optimization Hooks
8871 //===----------------------------------------------------------------------===//
8873 static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
8874 SDValue Operand, SelectionDAG &DAG,
8876 EVT VT = Operand.getValueType();
8877 if ((ST->hasNEON() &&
8878 (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
8879 VT == MVT::f32 || VT == MVT::v1f32 || VT == MVT::v2f32 ||
8880 VT == MVT::v4f32)) ||
8882 (VT == MVT::nxv8f16 || VT == MVT::nxv4f32 || VT == MVT::nxv2f64))) {
8883 if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
8884 // For the reciprocal estimates, convergence is quadratic, so the number
8885 // of digits is doubled after each iteration. In ARMv8, the accuracy of
8886 // the initial estimate is 2^-8. Thus the number of extra steps to refine
8887 // the result for float (23 mantissa bits) is 2 and for double (52
8888 // mantissa bits) is 3.
8889 ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;
8891 return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
8898 AArch64TargetLowering::getSqrtInputTest(SDValue Op, SelectionDAG &DAG,
8899 const DenormalMode &Mode) const {
8901 EVT VT = Op.getValueType();
8902 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
8903 SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
8904 return DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
8908 AArch64TargetLowering::getSqrtResultForDenormInput(SDValue Op,
8909 SelectionDAG &DAG) const {
8913 SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
8914 SelectionDAG &DAG, int Enabled,
8917 bool Reciprocal) const {
8918 if (Enabled == ReciprocalEstimate::Enabled ||
8919 (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt()))
8920 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
8923 EVT VT = Operand.getValueType();
8926 Flags.setAllowReassociation(true);
8928 // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
8929 // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
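// One refinement step for f32, as a sketch (register choices illustrative):
//   frsqrte s1, s0       // initial estimate E
//   fmul    s2, s1, s1   // E * E
//   frsqrts s2, s0, s2   // (3 - X * E * E) / 2
//   fmul    s1, s1, s2   // refined estimate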
8930 for (int i = ExtraSteps; i > 0; --i) {
8931 SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
8933 Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
8934 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8937 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
8946 SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
8947 SelectionDAG &DAG, int Enabled,
8948 int &ExtraSteps) const {
8949 if (Enabled == ReciprocalEstimate::Enabled)
8950 if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRECPE, Operand,
8953 EVT VT = Operand.getValueType();
8956 Flags.setAllowReassociation(true);
8958 // Newton reciprocal iteration: E * (2 - X * E)
8959 // AArch64 reciprocal iteration instruction: (2 - M * N)
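// One refinement step for f32, as a sketch (register choices illustrative):
//   frecpe s1, s0        // initial estimate E
//   frecps s2, s0, s1    // 2 - X * E
//   fmul   s1, s1, s2    // refined estimate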
8960 for (int i = ExtraSteps; i > 0; --i) {
8961 SDValue Step = DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand,
8963 Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
8973 //===----------------------------------------------------------------------===//
8974 // AArch64 Inline Assembly Support
8975 //===----------------------------------------------------------------------===//
8977 // Table of Constraints
8978 // TODO: This is the current set of constraints supported by ARM for the
8979 // compiler, not all of them may make sense.
8981 // r - A general register
8982 // w - An FP/SIMD register of some size in the range v0-v31
8983 // x - An FP/SIMD register of some size in the range v0-v15
8984 // I - Constant that can be used with an ADD instruction
8985 // J - Constant that can be used with a SUB instruction
8986 // K - Constant that can be used with a 32-bit logical instruction
8987 // L - Constant that can be used with a 64-bit logical instruction
8988 // M - Constant that can be used as a 32-bit MOV immediate
8989 // N - Constant that can be used as a 64-bit MOV immediate
8990 // Q - A memory reference with base register and no offset
8991 // S - A symbolic address
8992 // Y - Floating point constant zero
8993 // Z - Integer constant zero
8995 // Note that general register operands will be output using their 64-bit x
8996 // register name, whatever the size of the variable, unless the asm operand
8997 // is prefixed by the %w modifier. Floating-point and SIMD register operands
8998 // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
8999 // %q modifier.
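// A hypothetical example combining the 'r' and 'I' constraints with the %w
// modifier (variable names illustrative only):
//   int out, in;
//   asm("add %w0, %w1, %2" : "=r"(out) : "r"(in), "I"(42));
// Without %w the operands would be printed with their 64-bit x register names.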
9000 const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
9001 // At this point, we have to lower this constraint to something else, so we
9002 // lower it to an "r" or "w". However, by doing this we will force the result
9003 // to be in register, while the X constraint is much more permissive.
9005 // Although we are correct (we are free to emit anything, without
9006 // constraints), we might break use cases that would expect us to be more
9007 // efficient and emit something else.
9008 if (!Subtarget->hasFPARMv8())
9011 if (ConstraintVT.isFloatingPoint())
9014 if (ConstraintVT.isVector() &&
9015 (ConstraintVT.getSizeInBits() == 64 ||
9016 ConstraintVT.getSizeInBits() == 128))
9022 enum PredicateConstraint {
9028 static PredicateConstraint parsePredicateConstraint(StringRef Constraint) {
9029 PredicateConstraint P = PredicateConstraint::Invalid;
9030 if (Constraint == "Upa")
9031 P = PredicateConstraint::Upa;
9032 if (Constraint == "Upl")
9033 P = PredicateConstraint::Upl;
9037 /// getConstraintType - Given a constraint letter, return the type of
9038 /// constraint it is for this target.
9039 AArch64TargetLowering::ConstraintType
9040 AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
9041 if (Constraint.size() == 1) {
9042 switch (Constraint[0]) {
9048 return C_RegisterClass;
9049 // An address with a single base register. Due to the way we
9050 // currently handle addresses it is the same as 'r'.
9063 case 'S': // A symbolic address
9066 } else if (parsePredicateConstraint(Constraint) !=
9067 PredicateConstraint::Invalid)
9068 return C_RegisterClass;
9069 return TargetLowering::getConstraintType(Constraint);
9072 /// Examine constraint type and operand type and determine a weight value.
9073 /// This object must already have been set up with the operand type
9074 /// and the current alternative constraint selected.
9075 TargetLowering::ConstraintWeight
9076 AArch64TargetLowering::getSingleConstraintMatchWeight(
9077 AsmOperandInfo &info, const char *constraint) const {
9078 ConstraintWeight weight = CW_Invalid;
9079 Value *CallOperandVal = info.CallOperandVal;
9080 // If we don't have a value, we can't do a match,
9081 // but allow it at the lowest weight.
9082 if (!CallOperandVal)
9084 Type *type = CallOperandVal->getType();
9085 // Look at the constraint type.
9086 switch (*constraint) {
9088 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
9093 if (type->isFloatingPointTy() || type->isVectorTy())
9094 weight = CW_Register;
9097 weight = CW_Constant;
9100 if (parsePredicateConstraint(constraint) != PredicateConstraint::Invalid)
9101 weight = CW_Register;
9107 std::pair<unsigned, const TargetRegisterClass *>
9108 AArch64TargetLowering::getRegForInlineAsmConstraint(
9109 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
9110 if (Constraint.size() == 1) {
9111 switch (Constraint[0]) {
9113 if (VT.isScalableVector())
9114 return std::make_pair(0U, nullptr);
9115 if (Subtarget->hasLS64() && VT.getSizeInBits() == 512)
9116 return std::make_pair(0U, &AArch64::GPR64x8ClassRegClass);
9117 if (VT.getFixedSizeInBits() == 64)
9118 return std::make_pair(0U, &AArch64::GPR64commonRegClass);
9119 return std::make_pair(0U, &AArch64::GPR32commonRegClass);
9121 if (!Subtarget->hasFPARMv8())
9123 if (VT.isScalableVector()) {
9124 if (VT.getVectorElementType() != MVT::i1)
9125 return std::make_pair(0U, &AArch64::ZPRRegClass);
9126 return std::make_pair(0U, nullptr);
9128 uint64_t VTSize = VT.getFixedSizeInBits();
9130 return std::make_pair(0U, &AArch64::FPR16RegClass);
9132 return std::make_pair(0U, &AArch64::FPR32RegClass);
9134 return std::make_pair(0U, &AArch64::FPR64RegClass);
9136 return std::make_pair(0U, &AArch64::FPR128RegClass);
9139 // The instructions that this constraint is designed for can
9140 // only take 128-bit registers so just use that regclass.
9142 if (!Subtarget->hasFPARMv8())
9144 if (VT.isScalableVector())
9145 return std::make_pair(0U, &AArch64::ZPR_4bRegClass);
9146 if (VT.getSizeInBits() == 128)
9147 return std::make_pair(0U, &AArch64::FPR128_loRegClass);
9150 if (!Subtarget->hasFPARMv8())
9152 if (VT.isScalableVector())
9153 return std::make_pair(0U, &AArch64::ZPR_3bRegClass);
9157 PredicateConstraint PC = parsePredicateConstraint(Constraint);
9158 if (PC != PredicateConstraint::Invalid) {
9159 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
9160 return std::make_pair(0U, nullptr);
9161 bool restricted = (PC == PredicateConstraint::Upl);
9162 return restricted ? std::make_pair(0U, &AArch64::PPR_3bRegClass)
9163 : std::make_pair(0U, &AArch64::PPRRegClass);
9166 if (StringRef("{cc}").equals_insensitive(Constraint))
9167 return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
9169 // Use the default implementation in TargetLowering to convert the register
9170 // constraint into a member of a register class.
9171 std::pair<unsigned, const TargetRegisterClass *> Res;
9172 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
9174 // Not found as a standard register?
9176 unsigned Size = Constraint.size();
9177 if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
9178 tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
9180 bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
9181 if (!Failed && RegNo >= 0 && RegNo <= 31) {
9182 // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
9183 // By default we'll emit v0-v31 for this unless there's a modifier where
9184 // we'll emit the correct register as well.
9185 if (VT != MVT::Other && VT.getSizeInBits() == 64) {
9186 Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
9187 Res.second = &AArch64::FPR64RegClass;
9189 Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
9190 Res.second = &AArch64::FPR128RegClass;
9196 if (Res.second && !Subtarget->hasFPARMv8() &&
9197 !AArch64::GPR32allRegClass.hasSubClassEq(Res.second) &&
9198 !AArch64::GPR64allRegClass.hasSubClassEq(Res.second))
9199 return std::make_pair(0U, nullptr);
9204 EVT AArch64TargetLowering::getAsmOperandValueType(const DataLayout &DL,
9206 bool AllowUnknown) const {
9207 if (Subtarget->hasLS64() && Ty->isIntegerTy(512))
9208 return EVT(MVT::i64x8);
9210 return TargetLowering::getAsmOperandValueType(DL, Ty, AllowUnknown);
9213 /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
9214 /// vector. If it is invalid, don't add anything to Ops.
9215 void AArch64TargetLowering::LowerAsmOperandForConstraint(
9216 SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
9217 SelectionDAG &DAG) const {
9220 // Currently only support length 1 constraints.
9221 if (Constraint.length() != 1)
9224 char ConstraintLetter = Constraint[0];
9225 switch (ConstraintLetter) {
9229 // This set of constraints deals with valid constants for various instructions.
9230 // Validate and return a target constant for them if we can.
9232 // 'z' maps to xzr or wzr so it needs an input of 0.
9233 if (!isNullConstant(Op))
9236 if (Op.getValueType() == MVT::i64)
9237 Result = DAG.getRegister(AArch64::XZR, MVT::i64);
9239 Result = DAG.getRegister(AArch64::WZR, MVT::i32);
9243 // An absolute symbolic address or label reference.
9244 if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
9245 Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
9246 GA->getValueType(0));
9247 } else if (const BlockAddressSDNode *BA =
9248 dyn_cast<BlockAddressSDNode>(Op)) {
9250 DAG.getTargetBlockAddress(BA->getBlockAddress(), BA->getValueType(0));
9262 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
9266 // Grab the value and do some validation.
9267 uint64_t CVal = C->getZExtValue();
9268 switch (ConstraintLetter) {
9269 // The I constraint applies only to simple ADD or SUB immediate operands:
9270 // i.e. 0 to 4095 with optional shift by 12
9271 // The J constraint applies only to ADD or SUB immediates that would be
9272 // valid when negated, i.e. if [an add pattern] were to be output as a SUB
9273 // instruction [or vice versa], in other words -1 to -4095 with optional
9274 // left shift by 12.
9276 if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
9280 uint64_t NVal = -C->getSExtValue();
9281 if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
9282 CVal = C->getSExtValue();
9287 // The K and L constraints apply *only* to logical immediates, including
9288 // what used to be the MOVI alias for ORR (though the MOVI alias has now
9289 // been removed and MOV should be used). So these constraints have to
9290 // distinguish between bit patterns that are valid 32-bit or 64-bit
9291 // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
9292 // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
9295 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9299 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9302 // The M and N constraints are a superset of K and L respectively, for use
9303 // with the MOV (immediate) alias. As well as the logical immediates they
9304 // also match 32 or 64-bit immediates that can be loaded either using a
9305 // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca
9306 // (M) or 64-bit 0x1234000000000000 (N) etc.
9307 // As a note some of this code is liberally stolen from the asm parser.
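// For example, 0x12340000 satisfies M (a single MOVZ w0, #0x1234, lsl #16),
// whereas 0x12345678 does not: it is neither a single MOVZ/MOVN pattern nor a
// valid 32-bit logical immediate.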
9309 if (!isUInt<32>(CVal))
9311 if (AArch64_AM::isLogicalImmediate(CVal, 32))
9313 if ((CVal & 0xFFFF) == CVal)
9315 if ((CVal & 0xFFFF0000ULL) == CVal)
9317 uint64_t NCVal = ~(uint32_t)CVal;
9318 if ((NCVal & 0xFFFFULL) == NCVal)
9320 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9325 if (AArch64_AM::isLogicalImmediate(CVal, 64))
9327 if ((CVal & 0xFFFFULL) == CVal)
9329 if ((CVal & 0xFFFF0000ULL) == CVal)
9331 if ((CVal & 0xFFFF00000000ULL) == CVal)
9333 if ((CVal & 0xFFFF000000000000ULL) == CVal)
9335 uint64_t NCVal = ~CVal;
9336 if ((NCVal & 0xFFFFULL) == NCVal)
9338 if ((NCVal & 0xFFFF0000ULL) == NCVal)
9340 if ((NCVal & 0xFFFF00000000ULL) == NCVal)
9342 if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
9350 // All assembler immediates are 64-bit integers.
9351 Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
9355 if (Result.getNode()) {
9356 Ops.push_back(Result);
9360 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
9363 //===----------------------------------------------------------------------===//
9364 // AArch64 Advanced SIMD Support
9365 //===----------------------------------------------------------------------===//
9367 /// WidenVector - Given a value in the V64 register class, produce the
9368 /// equivalent value in the V128 register class.
9369 static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
9370 EVT VT = V64Reg.getValueType();
9371 unsigned NarrowSize = VT.getVectorNumElements();
9372 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9373 MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
9376 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
9377 V64Reg, DAG.getConstant(0, DL, MVT::i64));
9380 /// getExtFactor - Determine the adjustment factor for the position when
9381 /// generating an "extract from vector registers" instruction.
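/// For example, for a vector of i16 elements the factor is 2, so an extract
/// starting at element 3 corresponds to a byte immediate of 6.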
9382 static unsigned getExtFactor(SDValue &V) {
9383 EVT EltType = V.getValueType().getVectorElementType();
9384 return EltType.getSizeInBits() / 8;
9387 /// NarrowVector - Given a value in the V128 register class, produce the
9388 /// equivalent value in the V64 register class.
9389 static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
9390 EVT VT = V128Reg.getValueType();
9391 unsigned WideSize = VT.getVectorNumElements();
9392 MVT EltTy = VT.getVectorElementType().getSimpleVT();
9393 MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
9396 return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
9399 // Gather data to see if the operation can be modelled as a
9400 // shuffle in combination with VEXTs.
9401 SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
9402 SelectionDAG &DAG) const {
9403 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
9404 LLVM_DEBUG(dbgs() << "AArch64TargetLowering::ReconstructShuffle\n");
9406 EVT VT = Op.getValueType();
9407 assert(!VT.isScalableVector() &&
9408 "Scalable vectors cannot be used with ISD::BUILD_VECTOR");
9409 unsigned NumElts = VT.getVectorNumElements();
9411 struct ShuffleSourceInfo {
9416 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
9417 // be compatible with the shuffle we intend to construct. As a result
9418 // ShuffleVec will be some sliding window into the original Vec.
9421 // Code should guarantee that element i in Vec starts at element "WindowBase
9422 // + i * WindowScale in ShuffleVec".
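// For example, if Vec is a v4i32 but the shuffle is built with i16 lanes,
// WindowScale is 2 and element i of Vec occupies lanes WindowBase + 2*i and
// WindowBase + 2*i + 1 of ShuffleVec.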
9426 ShuffleSourceInfo(SDValue Vec)
9427 : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
9428 ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}
9430 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
9433 // First gather all vectors used as an immediate source for this BUILD_VECTOR
9435 SmallVector<ShuffleSourceInfo, 2> Sources;
9436 for (unsigned i = 0; i < NumElts; ++i) {
9437 SDValue V = Op.getOperand(i);
9440 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9441 !isa<ConstantSDNode>(V.getOperand(1)) ||
9442 V.getOperand(0).getValueType().isScalableVector()) {
9444 dbgs() << "Reshuffle failed: "
9445 "a shuffle can only come from building a vector from "
9446 "various elements of other fixed-width vectors, provided "
9447 "their indices are constant\n");
9451 // Add this element source to the list if it's not already there.
9452 SDValue SourceVec = V.getOperand(0);
9453 auto Source = find(Sources, SourceVec);
9454 if (Source == Sources.end())
9455 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
9457 // Update the minimum and maximum lane number seen.
9458 unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
9459 Source->MinElt = std::min(Source->MinElt, EltNo);
9460 Source->MaxElt = std::max(Source->MaxElt, EltNo);
9463 // If we have 3 or 4 sources, try to generate a TBL, which will at least be
9464 // better than moving to/from gpr registers for larger vectors.
9465 if ((Sources.size() == 3 || Sources.size() == 4) && NumElts > 4) {
9466 // Construct a mask for the tbl. We may need to adjust the index for types larger than i8.
9468 SmallVector<unsigned, 16> Mask;
9469 unsigned OutputFactor = VT.getScalarSizeInBits() / 8;
9470 for (unsigned I = 0; I < NumElts; ++I) {
9471 SDValue V = Op.getOperand(I);
9473 for (unsigned OF = 0; OF < OutputFactor; OF++)
9477 // Set the Mask lanes adjusted for the size of the input and output
9478 // lanes. The Mask is always i8, so it will set OutputFactor lanes per
9479 // output element, adjusted in their positions per input and output types.
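// For example, with a v8i16 output built from v8i16 sources, OutputFactor is
// 2, so an element taken from lane L of source S contributes the mask bytes
// 16 * S + 2 * L and 16 * S + 2 * L + 1.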
9480 unsigned Lane = V.getConstantOperandVal(1);
9481 for (unsigned S = 0; S < Sources.size(); S++) {
9482 if (V.getOperand(0) == Sources[S].Vec) {
9483 unsigned InputSize = Sources[S].Vec.getScalarValueSizeInBits();
9484 unsigned InputBase = 16 * S + Lane * InputSize / 8;
9485 for (unsigned OF = 0; OF < OutputFactor; OF++)
9486 Mask.push_back(InputBase + OF);
9492 // Construct the tbl3/tbl4 out of an intrinsic, the sources converted to
9493 // v16i8, and the TBLMask
9494 SmallVector<SDValue, 16> TBLOperands;
9495 TBLOperands.push_back(DAG.getConstant(Sources.size() == 3
9496 ? Intrinsic::aarch64_neon_tbl3
9497 : Intrinsic::aarch64_neon_tbl4,
9499 for (unsigned i = 0; i < Sources.size(); i++) {
9500 SDValue Src = Sources[i].Vec;
9501 EVT SrcVT = Src.getValueType();
9502 Src = DAG.getBitcast(SrcVT.is64BitVector() ? MVT::v8i8 : MVT::v16i8, Src);
9503 assert((SrcVT.is64BitVector() || SrcVT.is128BitVector()) &&
9504 "Expected a legally typed vector");
9505 if (SrcVT.is64BitVector())
9506 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, Src,
9507 DAG.getUNDEF(MVT::v8i8));
9508 TBLOperands.push_back(Src);
9511 SmallVector<SDValue, 16> TBLMask;
9512 for (unsigned i = 0; i < Mask.size(); i++)
9513 TBLMask.push_back(DAG.getConstant(Mask[i], dl, MVT::i32));
9514 assert((Mask.size() == 8 || Mask.size() == 16) &&
9515 "Expected a v8i8 or v16i8 Mask");
9516 TBLOperands.push_back(
9517 DAG.getBuildVector(Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, dl, TBLMask));
9520 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl,
9521 Mask.size() == 8 ? MVT::v8i8 : MVT::v16i8, TBLOperands);
9522 return DAG.getBitcast(VT, Shuffle);
9525 if (Sources.size() > 2) {
9526 LLVM_DEBUG(dbgs() << "Reshuffle failed: currently only do something "
9527 << "sensible when at most two source vectors are "
9532 // Find out the smallest element size among result and two sources, and use
9533 // it as element size to build the shuffle_vector.
9534 EVT SmallestEltTy = VT.getVectorElementType();
9535 for (auto &Source : Sources) {
9536 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
9537 if (SrcEltTy.bitsLT(SmallestEltTy)) {
9538 SmallestEltTy = SrcEltTy;
9541 unsigned ResMultiplier =
9542 VT.getScalarSizeInBits() / SmallestEltTy.getFixedSizeInBits();
9543 uint64_t VTSize = VT.getFixedSizeInBits();
9544 NumElts = VTSize / SmallestEltTy.getFixedSizeInBits();
9545 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
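// For example, a v4i32 result built from a v8i16 source gives SmallestEltTy ==
// i16, ResMultiplier == 2 and ShuffleVT == v8i16.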
9547 // If the source vector is too wide or too narrow, we may nevertheless be able
9548 // to construct a compatible shuffle either by concatenating it with UNDEF or
9549 // extracting a suitable range of elements.
9550 for (auto &Src : Sources) {
9551 EVT SrcVT = Src.ShuffleVec.getValueType();
9553 TypeSize SrcVTSize = SrcVT.getSizeInBits();
9554 if (SrcVTSize == TypeSize::Fixed(VTSize))
9557 // This stage of the search produces a source with the same element type as
9558 // the original, but with a total width matching the BUILD_VECTOR output.
9559 EVT EltVT = SrcVT.getVectorElementType();
9560 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
9561 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
9563 if (SrcVTSize.getFixedValue() < VTSize) {
9564 assert(2 * SrcVTSize == VTSize);
9565 // We can pad out the smaller vector for free, so if it's part of a shuffle we can simply concatenate it with UNDEF.
9568 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
9569 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
9573 if (SrcVTSize.getFixedValue() != 2 * VTSize) {
9575 dbgs() << "Reshuffle failed: result vector too small to extract\n");
9579 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
9581 dbgs() << "Reshuffle failed: span too large for a VEXT to cope\n");
9585 if (Src.MinElt >= NumSrcElts) {
9586 // The extraction can just take the second half
9588 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
9589 DAG.getConstant(NumSrcElts, dl, MVT::i64));
9590 Src.WindowBase = -NumSrcElts;
9591 } else if (Src.MaxElt < NumSrcElts) {
9592 // The extraction can just take the first half
9594 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
9595 DAG.getConstant(0, dl, MVT::i64));
9597 // An actual VEXT is needed
9599 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
9600 DAG.getConstant(0, dl, MVT::i64));
9602 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
9603 DAG.getConstant(NumSrcElts, dl, MVT::i64));
9604 unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);
9606 if (!SrcVT.is64BitVector()) {
9608 dbgs() << "Reshuffle failed: don't know how to lower AArch64ISD::EXT "
9609 "for SVE vectors.");
9613 Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
9615 DAG.getConstant(Imm, dl, MVT::i32));
9616 Src.WindowBase = -Src.MinElt;
9620 // Another possible incompatibility occurs from the vector element types. We
9621 // can fix this by bitcasting the source vectors to the same type we intend for the shuffle.
9623 for (auto &Src : Sources) {
9624 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
9625 if (SrcEltTy == SmallestEltTy)
9627 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
9628 Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
9630 SrcEltTy.getFixedSizeInBits() / SmallestEltTy.getFixedSizeInBits();
9631 Src.WindowBase *= Src.WindowScale;
9634 // Final check before we try to actually produce a shuffle.
9635 LLVM_DEBUG(for (auto Src
9637 assert(Src.ShuffleVec.getValueType() == ShuffleVT););
9639 // The stars all align, our next step is to produce the mask for the shuffle.
9640 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
9641 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
9642 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
9643 SDValue Entry = Op.getOperand(i);
9644 if (Entry.isUndef())
9647 auto Src = find(Sources, Entry.getOperand(0));
9648 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
9650 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
9651 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this segment.
9653 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
9654 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
9655 VT.getScalarSizeInBits());
9656 int LanesDefined = BitsDefined / BitsPerShuffleLane;
9658 // This source is expected to fill ResMultiplier lanes of the final shuffle,
9659 // starting at the appropriate offset.
9660 int *LaneMask = &Mask[i * ResMultiplier];
9662 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
9663 ExtractBase += NumElts * (Src - Sources.begin());
9664 for (int j = 0; j < LanesDefined; ++j)
9665 LaneMask[j] = ExtractBase + j;
9668 // Final check before we try to produce nonsense...
9669 if (!isShuffleMaskLegal(Mask, ShuffleVT)) {
9670 LLVM_DEBUG(dbgs() << "Reshuffle failed: illegal shuffle mask\n");
9674 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
9675 for (unsigned i = 0; i < Sources.size(); ++i)
9676 ShuffleOps[i] = Sources[i].ShuffleVec;
9678 SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
9679 ShuffleOps[1], Mask);
9680 SDValue V = DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
9682 LLVM_DEBUG(dbgs() << "Reshuffle, creating node: "; Shuffle.dump();
9683 dbgs() << "Reshuffle, creating node: "; V.dump(););
9688 // check if an EXT instruction can handle the shuffle mask when the
9689 // vector sources of the shuffle are the same.
9690 static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
9691 unsigned NumElts = VT.getVectorNumElements();
9693 // Assume that the first shuffle index is not UNDEF. Fail if it is.
9699 // If this is a VEXT shuffle, the immediate value is the index of the first
9700 // element. The other shuffle indices must be the successive elements after
9702 unsigned ExpectedElt = Imm;
9703 for (unsigned i = 1; i < NumElts; ++i) {
9704 // Increment the expected index. If it wraps around, just follow it
9705 // back to index zero and keep going.
9707 if (ExpectedElt == NumElts)
9711 continue; // ignore UNDEF indices
9712 if (ExpectedElt != static_cast<unsigned>(M[i]))
9719 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
9720 // v4i32s. This is really a truncate, which we can construct out of (legal)
9721 // concats and truncate nodes.
9722 static SDValue ReconstructTruncateFromBuildVector(SDValue V, SelectionDAG &DAG) {
9723 if (V.getValueType() != MVT::v16i8)
9725 assert(V.getNumOperands() == 16 && "Expected 16 operands on the BUILDVECTOR");
9727 for (unsigned X = 0; X < 4; X++) {
9728 // Check the first item in each group is an extract from lane 0 of a v4i32 or v4i16 vector.
9730 SDValue BaseExt = V.getOperand(X * 4);
9731 if (BaseExt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9732 (BaseExt.getOperand(0).getValueType() != MVT::v4i16 &&
9733 BaseExt.getOperand(0).getValueType() != MVT::v4i32) ||
9734 !isa<ConstantSDNode>(BaseExt.getOperand(1)) ||
9735 BaseExt.getConstantOperandVal(1) != 0)
9737 SDValue Base = BaseExt.getOperand(0);
9738 // And check the other items are extracts from the same vector.
9739 for (unsigned Y = 1; Y < 4; Y++) {
9740 SDValue Ext = V.getOperand(X * 4 + Y);
9741 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
9742 Ext.getOperand(0) != Base ||
9743 !isa<ConstantSDNode>(Ext.getOperand(1)) ||
9744 Ext.getConstantOperandVal(1) != Y)
9749 // Turn the buildvector into a series of truncates and concats, which will
9750 // become uzip1's. Any v4i32s we found get truncated to v4i16, which are
9751 // concatenated together to produce 2 v8i16. These are both truncated and concatenated to form the final v16i8.
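// Illustratively, four v4i32 inputs a, b, c and d become: trunc to v4i16,
// concat(a, b) and concat(c, d) as v8i16, trunc each of those to v8i8, and
// finally concat into the v16i8 result.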
9754 SDValue Trunc[4] = {
9755 V.getOperand(0).getOperand(0), V.getOperand(4).getOperand(0),
9756 V.getOperand(8).getOperand(0), V.getOperand(12).getOperand(0)};
9757 for (int I = 0; I < 4; I++)
9758 if (Trunc[I].getValueType() == MVT::v4i32)
9759 Trunc[I] = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i16, Trunc[I]);
9761 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[0], Trunc[1]);
9763 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16, Trunc[2], Trunc[3]);
9764 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat0);
9765 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, Concat1);
9766 return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Trunc0, Trunc1);
9769 /// Check if a vector shuffle corresponds to a DUP instruction with a larger
9770 /// element width than the vector lane type. If that is the case, the function
9771 /// returns true and writes the value of the DUP instruction lane operand into DupLaneOp.
9773 static bool isWideDUPMask(ArrayRef<int> M, EVT VT, unsigned BlockSize,
9774 unsigned &DupLaneOp) {
9775 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
9776 "Only possible block sizes for wide DUP are: 16, 32, 64");
9778 if (BlockSize <= VT.getScalarSizeInBits())
9780 if (BlockSize % VT.getScalarSizeInBits() != 0)
9782 if (VT.getSizeInBits() % BlockSize != 0)
9785 size_t SingleVecNumElements = VT.getVectorNumElements();
9786 size_t NumEltsPerBlock = BlockSize / VT.getScalarSizeInBits();
9787 size_t NumBlocks = VT.getSizeInBits() / BlockSize;
9789 // We are looking for masks like
9790 // [0, 1, 0, 1] or [2, 3, 2, 3] or [4, 5, 6, 7, 4, 5, 6, 7] where any element
9791 // might be replaced by 'undefined'. BlockElts will eventually contain
9792 // lane indices of the duplicated block (i.e. [0, 1], [2, 3] and [4, 5, 6, 7]
9793 // for the above examples)
9794 SmallVector<int, 8> BlockElts(NumEltsPerBlock, -1);
9795 for (size_t BlockIndex = 0; BlockIndex < NumBlocks; BlockIndex++)
9796 for (size_t I = 0; I < NumEltsPerBlock; I++) {
9797 int Elt = M[BlockIndex * NumEltsPerBlock + I];
9800 // For now we don't support shuffles that use the second operand
9801 if ((unsigned)Elt >= SingleVecNumElements)
9803 if (BlockElts[I] < 0)
9805 else if (BlockElts[I] != Elt)
9809 // We found a candidate block (possibly with some undefs). It must be a
9810 // sequence of consecutive integers starting with a value divisible by
9811 // NumEltsPerBlock with some values possibly replaced by undef-s.
9813 // Find first non-undef element
9814 auto FirstRealEltIter = find_if(BlockElts, [](int Elt) { return Elt >= 0; });
9815 assert(FirstRealEltIter != BlockElts.end() &&
9816 "Shuffle with all-undefs must have been caught by previous cases, "
9818 if (FirstRealEltIter == BlockElts.end()) {
9823 // Index of FirstRealElt in BlockElts
9824 size_t FirstRealIndex = FirstRealEltIter - BlockElts.begin();
9826 if ((unsigned)*FirstRealEltIter < FirstRealIndex)
9828 // BlockElts[0] must have the following value if it isn't undef:
9829 size_t Elt0 = *FirstRealEltIter - FirstRealIndex;
9831 // Check the first element
9832 if (Elt0 % NumEltsPerBlock != 0)
9834 // Check that the sequence indeed consists of consecutive integers (modulo undefs).
9836 for (size_t I = 0; I < NumEltsPerBlock; I++)
9837 if (BlockElts[I] >= 0 && (unsigned)BlockElts[I] != Elt0 + I)
9840 DupLaneOp = Elt0 / NumEltsPerBlock;
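// For example, the v8i16 mask <4, 5, 4, 5, 4, 5, 4, 5> with BlockSize 32 has
// NumEltsPerBlock == 2 and block [4, 5], so Elt0 == 4 and DupLaneOp == 2
// (a DUP from 32-bit lane 2).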
9844 // check if an EXT instruction can handle the shuffle mask when the
9845 // vector sources of the shuffle are different.
9846 static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
9848 // Look for the first non-undef element.
9849 const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
9851 // Benefit from APInt to handle overflow when calculating the expected element.
9852 unsigned NumElts = VT.getVectorNumElements();
9853 unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
9854 APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
9855 // The following shuffle indices must be the successive elements after the
9856 // first real element.
9857 bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) {
9858 return Elt != ExpectedElt++ && Elt != -1;
9863 // The index of an EXT is the first element if it is not UNDEF.
9864 // Watch out for the beginning UNDEFs. The EXT index should be the expected
9865 // value of the first element. E.g.
9866 // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
9867 // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
9868 // ExpectedElt is the last mask index plus 1.
9869 Imm = ExpectedElt.getZExtValue();
9871 // There are two different cases that require reversing the input vectors.
9872 // For example, for vector <4 x i32> we have the following cases,
9873 // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
9874 // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
9875 // For both cases, we finally use mask <5, 6, 7, 0>, which requires
9876 // to reverse two input vectors.
9885 /// isREVMask - Check if a vector shuffle corresponds to a REV
9886 /// instruction with the specified blocksize. (The order of the elements
9887 /// within each block of the vector is reversed.)
9888 static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
9889 assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
9890 "Only possible block sizes for REV are: 16, 32, 64");
9892 unsigned EltSz = VT.getScalarSizeInBits();
9896 unsigned NumElts = VT.getVectorNumElements();
9897 unsigned BlockElts = M[0] + 1;
9898 // If the first shuffle index is UNDEF, be optimistic.
9900 BlockElts = BlockSize / EltSz;
9902 if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
9905 for (unsigned i = 0; i < NumElts; ++i) {
9907 continue; // ignore UNDEF indices
9908 if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
9915 static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9916 unsigned NumElts = VT.getVectorNumElements();
9917 if (NumElts % 2 != 0)
9919 WhichResult = (M[0] == 0 ? 0 : 1);
9920 unsigned Idx = WhichResult * NumElts / 2;
9921 for (unsigned i = 0; i != NumElts; i += 2) {
9922 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
9923 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
9931 static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9932 unsigned NumElts = VT.getVectorNumElements();
9933 WhichResult = (M[0] == 0 ? 0 : 1);
9934 for (unsigned i = 0; i != NumElts; ++i) {
9936 continue; // ignore UNDEF indices
9937 if ((unsigned)M[i] != 2 * i + WhichResult)
9944 static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9945 unsigned NumElts = VT.getVectorNumElements();
9946 if (NumElts % 2 != 0)
9948 WhichResult = (M[0] == 0 ? 0 : 1);
9949 for (unsigned i = 0; i < NumElts; i += 2) {
9950 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
9951 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
9957 /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
9958 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9959 /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
9960 static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9961 unsigned NumElts = VT.getVectorNumElements();
9962 if (NumElts % 2 != 0)
9964 WhichResult = (M[0] == 0 ? 0 : 1);
9965 unsigned Idx = WhichResult * NumElts / 2;
9966 for (unsigned i = 0; i != NumElts; i += 2) {
9967 if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
9968 (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
9976 /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
9977 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9978 /// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
9979 static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9980 unsigned Half = VT.getVectorNumElements() / 2;
9981 WhichResult = (M[0] == 0 ? 0 : 1);
9982 for (unsigned j = 0; j != 2; ++j) {
9983 unsigned Idx = WhichResult;
9984 for (unsigned i = 0; i != Half; ++i) {
9985 int MIdx = M[i + j * Half];
9986 if (MIdx >= 0 && (unsigned)MIdx != Idx)
9995 /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
9996 /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
9997 /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
9998 static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
9999 unsigned NumElts = VT.getVectorNumElements();
10000 if (NumElts % 2 != 0)
10002 WhichResult = (M[0] == 0 ? 0 : 1);
10003 for (unsigned i = 0; i < NumElts; i += 2) {
10004 if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
10005 (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
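/// isINSMask - Check whether the mask leaves all but one lane of one input
/// unchanged and replaces that single lane (the Anomaly) with an arbitrary
/// element, which maps onto a single INS. For example, with 4 elements the
/// mask <0, 1, 6, 3> keeps lanes 0, 1 and 3 of the LHS and inserts element 2
/// of the RHS into lane 2, giving DstIsLeft == true and Anomaly == 2.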
10011 static bool isINSMask(ArrayRef<int> M, int NumInputElements,
10012 bool &DstIsLeft, int &Anomaly) {
10013 if (M.size() != static_cast<size_t>(NumInputElements))
10016 int NumLHSMatch = 0, NumRHSMatch = 0;
10017 int LastLHSMismatch = -1, LastRHSMismatch = -1;
10019 for (int i = 0; i < NumInputElements; ++i) {
10029 LastLHSMismatch = i;
10031 if (M[i] == i + NumInputElements)
10034 LastRHSMismatch = i;
10037 if (NumLHSMatch == NumInputElements - 1) {
10039 Anomaly = LastLHSMismatch;
10041 } else if (NumRHSMatch == NumInputElements - 1) {
10043 Anomaly = LastRHSMismatch;
10050 static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
10051 if (VT.getSizeInBits() != 128)
10054 unsigned NumElts = VT.getVectorNumElements();
10056 for (int I = 0, E = NumElts / 2; I != E; I++) {
10061 int Offset = NumElts / 2;
10062 for (int I = NumElts / 2, E = NumElts; I != E; I++) {
10063 if (Mask[I] != I + SplitLHS * Offset)
10070 static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
10072 EVT VT = Op.getValueType();
10073 SDValue V0 = Op.getOperand(0);
10074 SDValue V1 = Op.getOperand(1);
10075 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10077 if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
10078 VT.getVectorElementType() != V1.getValueType().getVectorElementType())
10081 bool SplitV0 = V0.getValueSizeInBits() == 128;
10083 if (!isConcatMask(Mask, VT, SplitV0))
10086 EVT CastVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
10088 V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
10089 DAG.getConstant(0, DL, MVT::i64));
10091 if (V1.getValueSizeInBits() == 128) {
10092 V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
10093 DAG.getConstant(0, DL, MVT::i64));
10095 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
10098 /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
10099 /// the specified operations to build the shuffle. ID is the perfect-shuffle
10100 /// ID, V1 and V2 are the original shuffle inputs. PFEntry is the perfect-shuffle
10101 /// table entry and LHS/RHS are the immediate inputs for this stage of the shuffle.
10103 static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
10104 SDValue V2, unsigned PFEntry, SDValue LHS,
10105 SDValue RHS, SelectionDAG &DAG,
10107 unsigned OpNum = (PFEntry >> 26) & 0x0F;
10108 unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
10109 unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
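// The PFEntry fields decoded above: bits [29:26] give the operation, bits
// [25:13] the perfect-shuffle ID of the LHS input and bits [12:0] the ID of
// the RHS input.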
10112 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
10121 OP_VUZPL, // VUZP, left result
10122 OP_VUZPR, // VUZP, right result
10123 OP_VZIPL, // VZIP, left result
10124 OP_VZIPR, // VZIP, right result
10125 OP_VTRNL, // VTRN, left result
10126 OP_VTRNR, // VTRN, right result
10127 OP_MOVLANE // Move lane. RHSID is the lane to move into
10130 if (OpNum == OP_COPY) {
10131 if (LHSID == (1 * 9 + 2) * 9 + 3)
10133 assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
10137 if (OpNum == OP_MOVLANE) {
10138 // Decompose a PerfectShuffle ID to get the Mask for lane Elt
10139 auto getPFIDLane = [](unsigned ID, int Elt) -> int {
10140 assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
10146 return (ID % 9 == 8) ? -1 : ID % 9;
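// A perfect-shuffle ID encodes the four result lanes as base-9 digits, with
// lane 0 in the most significant digit; the digit value 8 denotes an undef
// lane, which is why the lambda returns -1 for it.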
10149 // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. The
10150 // lane to move from is taken from the PFID, which always refers to one of the
10151 // original vectors (V1 or V2).
10152 SDValue OpLHS = GeneratePerfectShuffle(
10153 LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
10154 EVT VT = OpLHS.getValueType();
10155 assert(RHSID < 8 && "Expected a lane index for RHSID!");
10156 unsigned ExtLane = 0;
10159 // OP_MOVLANE is either a D mov (if bit 0x4 is set) or an S mov. D movs
10160 // convert into a higher (wider) element type.
10162 int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
10164 MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
10165 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10166 ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
10167 Input = MaskElt < 2 ? V1 : V2;
10168 if (VT.getScalarSizeInBits() == 16) {
10169 Input = DAG.getBitcast(MVT::v2f32, Input);
10170 OpLHS = DAG.getBitcast(MVT::v2f32, OpLHS);
10172 assert(VT.getScalarSizeInBits() == 32 &&
10173 "Expected 16 or 32 bit shuffle elemements");
10174 Input = DAG.getBitcast(MVT::v2f64, Input);
10175 OpLHS = DAG.getBitcast(MVT::v2f64, OpLHS);
10178 int MaskElt = getPFIDLane(ID, RHSID);
10179 assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
10180 ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
10181 Input = MaskElt < 4 ? V1 : V2;
10182 // Be careful about creating illegal types. Use f16 instead of i16.
10183 if (VT == MVT::v4i16) {
10184 Input = DAG.getBitcast(MVT::v4f16, Input);
10185 OpLHS = DAG.getBitcast(MVT::v4f16, OpLHS);
10188 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
10189 Input.getValueType().getVectorElementType(),
10190 Input, DAG.getVectorIdxConstant(ExtLane, dl));
10192 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Input.getValueType(), OpLHS,
10193 Ext, DAG.getVectorIdxConstant(RHSID & 0x3, dl));
10194 return DAG.getBitcast(VT, Ins);
10197 SDValue OpLHS, OpRHS;
10198 OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
10200 OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
10202 EVT VT = OpLHS.getValueType();
10206 llvm_unreachable("Unknown shuffle opcode!");
10208 // VREV divides the vector in half and swaps within the half.
10209 if (VT.getVectorElementType() == MVT::i32 ||
10210 VT.getVectorElementType() == MVT::f32)
10211 return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
10212 // vrev <4 x i16> -> REV32
10213 if (VT.getVectorElementType() == MVT::i16 ||
10214 VT.getVectorElementType() == MVT::f16 ||
10215 VT.getVectorElementType() == MVT::bf16)
10216 return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
10217 // vrev <4 x i8> -> REV16
10218 assert(VT.getVectorElementType() == MVT::i8);
10219 return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
10224 EVT EltTy = VT.getVectorElementType();
10226 if (EltTy == MVT::i8)
10227 Opcode = AArch64ISD::DUPLANE8;
10228 else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
10229 Opcode = AArch64ISD::DUPLANE16;
10230 else if (EltTy == MVT::i32 || EltTy == MVT::f32)
10231 Opcode = AArch64ISD::DUPLANE32;
10232 else if (EltTy == MVT::i64 || EltTy == MVT::f64)
10233 Opcode = AArch64ISD::DUPLANE64;
10235 llvm_unreachable("Invalid vector element type?");
10237 if (VT.getSizeInBits() == 64)
10238 OpLHS = WidenVector(OpLHS, DAG);
10239 SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
10240 return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
10245 unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
10246 return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
10247 DAG.getConstant(Imm, dl, MVT::i32));
10250 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
10253 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
10256 return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
10259 return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
10262 return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
10265 return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
10270 static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
10271 SelectionDAG &DAG) {
10272 // Check to see if we can use the TBL instruction.
10273 SDValue V1 = Op.getOperand(0);
10274 SDValue V2 = Op.getOperand(1);
10277 EVT EltVT = Op.getValueType().getVectorElementType();
10278 unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
10281 if (V1.isUndef() || isZerosVector(V1.getNode())) {
10286 // If the V2 source is undef or zero then we can use a tbl1, as tbl1 will fill
10287 // out of range values with 0s. We do need to make sure that any out-of-range
10288 // values are really out-of-range for a v16i8 vector.
10289 bool IsUndefOrZero = V2.isUndef() || isZerosVector(V2.getNode());
10290 MVT IndexVT = MVT::v8i8;
10291 unsigned IndexLen = 8;
10292 if (Op.getValueSizeInBits() == 128) {
10293 IndexVT = MVT::v16i8;
10297 SmallVector<SDValue, 8> TBLMask;
10298 for (int Val : ShuffleMask) {
10299 for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
10300 unsigned Offset = Byte + Val * BytesPerElt;
10302 Offset = Offset < IndexLen ? Offset + IndexLen : Offset - IndexLen;
10303 if (IsUndefOrZero && Offset >= IndexLen)
10305 TBLMask.push_back(DAG.getConstant(Offset, DL, MVT::i32));
10309 SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
10310 SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
10313 if (IsUndefOrZero) {
10315 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
10316 Shuffle = DAG.getNode(
10317 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10318 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10319 DAG.getBuildVector(IndexVT, DL,
10320 makeArrayRef(TBLMask.data(), IndexLen)));
10322 if (IndexLen == 8) {
10323 V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
10324 Shuffle = DAG.getNode(
10325 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10326 DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), V1Cst,
10327 DAG.getBuildVector(IndexVT, DL,
10328 makeArrayRef(TBLMask.data(), IndexLen)));
10330 // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
10331 // cannot currently represent the register constraints on the input
10332 // table registers.
10333 // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
10334 // DAG.getBuildVector(IndexVT, DL, &TBLMask[0],
10336 Shuffle = DAG.getNode(
10337 ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
10338 DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), V1Cst,
10339 V2Cst, DAG.getBuildVector(IndexVT, DL,
10340 makeArrayRef(TBLMask.data(), IndexLen)));
10343 return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
10346 static unsigned getDUPLANEOp(EVT EltType) {
10347 if (EltType == MVT::i8)
10348 return AArch64ISD::DUPLANE8;
10349 if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
10350 return AArch64ISD::DUPLANE16;
10351 if (EltType == MVT::i32 || EltType == MVT::f32)
10352 return AArch64ISD::DUPLANE32;
10353 if (EltType == MVT::i64 || EltType == MVT::f64)
10354 return AArch64ISD::DUPLANE64;
10356 llvm_unreachable("Invalid vector element type?");
10359 static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
10360 unsigned Opcode, SelectionDAG &DAG) {
10361 // Try to eliminate a bitcasted extract subvector before a DUPLANE.
10362 auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
10363 // Match: dup (bitcast (extract_subv X, C)), LaneC
10364 if (BitCast.getOpcode() != ISD::BITCAST ||
10365 BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
10368 // The extract index must align in the destination type. That may not
10369 // happen if the bitcast is from narrow to wide type.
10370 SDValue Extract = BitCast.getOperand(0);
10371 unsigned ExtIdx = Extract.getConstantOperandVal(1);
10372 unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
10373 unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
10374 unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
10375 if (ExtIdxInBits % CastedEltBitWidth != 0)
10378 // Can't handle cases where vector size is not 128-bit
10379 if (!Extract.getOperand(0).getValueType().is128BitVector())
10382 // Update the lane value by offsetting with the scaled extract index.
10383 LaneC += ExtIdxInBits / CastedEltBitWidth;
10385 // Determine the casted vector type of the wide vector input.
10386 // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
10388 // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
10389 // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
10390 unsigned SrcVecNumElts =
10391 Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
10392 CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
10397 if (getScaledOffsetDup(V, Lane, CastVT)) {
10398 V = DAG.getBitcast(CastVT, V.getOperand(0).getOperand(0));
10399 } else if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
10400 V.getOperand(0).getValueType().is128BitVector()) {
10401 // The lane is incremented by the index of the extract.
10402 // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
10403 Lane += V.getConstantOperandVal(1);
10404 V = V.getOperand(0);
10405 } else if (V.getOpcode() == ISD::CONCAT_VECTORS) {
10406 // The lane is decremented if we are splatting from the 2nd operand.
10407 // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
10408 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
10409 Lane -= Idx * VT.getVectorNumElements() / 2;
10410 V = WidenVector(V.getOperand(Idx), DAG);
10411 } else if (VT.getSizeInBits() == 64) {
10412 // Widen the operand to 128-bit register with undef.
10413 V = WidenVector(V, DAG);
10415 return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
10418 // Return true if we can derive a new (half-length) shuffle mask by checking
10419 // the parameter mask array to test whether every pair of adjacent mask values
10420 // is consecutive and starts at an even index.
10421 static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
10422 SmallVectorImpl<int> &NewMask) {
10423 unsigned NumElts = VT.getVectorNumElements();
10424 if (NumElts % 2 != 0)
10428 for (unsigned i = 0; i < NumElts; i += 2) {
10432 // If both elements are undef, new mask is undef too.
10433 if (M0 == -1 && M1 == -1) {
10434 NewMask.push_back(-1);
10438 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
10439 NewMask.push_back(M1 / 2);
10443 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
10444 NewMask.push_back(M0 / 2);
10452 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
10456 // Try to widen element type to get a new mask value for a better permutation
10457 // sequence, so that we can use NEON shuffle instructions, such as zip1/2,
10458 // UZP1/2, TRN1/2, REV, INS, etc.
10460 // For example: shufflevector <4 x i32> %a, <4 x i32> %b,
10461 // <4 x i32> <i32 6, i32 7, i32 2, i32 3>
10462 // is equivalent to:
10463 // shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
10464 // Finally, we can get:
10465 // mov v0.d[0], v1.d[1]
10466 static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
10468 EVT VT = Op.getValueType();
10469 EVT ScalarVT = VT.getVectorElementType();
10470 unsigned ElementSize = ScalarVT.getFixedSizeInBits();
10471 SDValue V0 = Op.getOperand(0);
10472 SDValue V1 = Op.getOperand(1);
10473 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
10475 // If combining adjacent elements, like two i16's -> i32, two i32's -> i64 ...
10476 // We need to make sure the wider element type is legal. Thus, ElementSize
10477 // should not be larger than 32 bits, and the i1 type should also be excluded.
10478 if (ElementSize > 32 || ElementSize == 1)
10481 SmallVector<int, 8> NewMask;
10482 if (isWideTypeMask(Mask, VT, NewMask)) {
10483 MVT NewEltVT = VT.isFloatingPoint()
10484 ? MVT::getFloatingPointVT(ElementSize * 2)
10485 : MVT::getIntegerVT(ElementSize * 2);
10486 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
10487 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
10488 V0 = DAG.getBitcast(NewVT, V0);
10489 V1 = DAG.getBitcast(NewVT, V1);
10490 return DAG.getBitcast(VT,
10491 DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
10498 SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
10499 SelectionDAG &DAG) const {
10501 EVT VT = Op.getValueType();
10503 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
10505 if (useSVEForFixedLengthVectorVT(VT))
10506 return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);
10508 // Convert shuffles that are directly supported on NEON to target-specific
10509 // DAG nodes, instead of keeping them as shuffles and matching them again
10510 // during code selection. This is more efficient and avoids the possibility
10511 // of inconsistencies between legalization and selection.
10512 ArrayRef<int> ShuffleMask = SVN->getMask();
10514 SDValue V1 = Op.getOperand(0);
10515 SDValue V2 = Op.getOperand(1);
10517 assert(V1.getValueType() == VT && "Unexpected VECTOR_SHUFFLE type!");
10518 assert(ShuffleMask.size() == VT.getVectorNumElements() &&
10519 "Unexpected VECTOR_SHUFFLE mask size!");
10521 if (SVN->isSplat()) {
10522 int Lane = SVN->getSplatIndex();
10523 // If this is undef splat, generate it via "just" vdup, if possible.
10527 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
10528 return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
10530 // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
10531 // constant. If so, we can just reference the lane's definition directly.
10532 if (V1.getOpcode() == ISD::BUILD_VECTOR &&
10533 !isa<ConstantSDNode>(V1.getOperand(Lane)))
10534 return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
10536 // Otherwise, duplicate from the lane of the input vector.
10537 unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
10538 return constructDup(V1, Lane, dl, VT, Opcode, DAG);
10541 // Check if the mask matches a DUP for a wider element
10542 for (unsigned LaneSize : {64U, 32U, 16U}) {
10544 if (isWideDUPMask(ShuffleMask, VT, LaneSize, Lane)) {
10545 unsigned Opcode = LaneSize == 64 ? AArch64ISD::DUPLANE64
10546 : LaneSize == 32 ? AArch64ISD::DUPLANE32
10547 : AArch64ISD::DUPLANE16;
10548 // Cast V1 to an integer vector with required lane size
10549 MVT NewEltTy = MVT::getIntegerVT(LaneSize);
10550 unsigned NewEltCount = VT.getSizeInBits() / LaneSize;
10551 MVT NewVecTy = MVT::getVectorVT(NewEltTy, NewEltCount);
10552 V1 = DAG.getBitcast(NewVecTy, V1);
10553 // Construct the DUP instruction
10554 V1 = constructDup(V1, Lane, dl, NewVecTy, Opcode, DAG);
10555 // Cast back to the original type
10556 return DAG.getBitcast(VT, V1);
10560 if (isREVMask(ShuffleMask, VT, 64))
10561 return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
10562 if (isREVMask(ShuffleMask, VT, 32))
10563 return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
10564 if (isREVMask(ShuffleMask, VT, 16))
10565 return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
10567 if (((VT.getVectorNumElements() == 8 && VT.getScalarSizeInBits() == 16) ||
10568 (VT.getVectorNumElements() == 16 && VT.getScalarSizeInBits() == 8)) &&
10569 ShuffleVectorInst::isReverseMask(ShuffleMask)) {
10570 SDValue Rev = DAG.getNode(AArch64ISD::REV64, dl, VT, V1);
10571 return DAG.getNode(AArch64ISD::EXT, dl, VT, Rev, Rev,
10572 DAG.getConstant(8, dl, MVT::i32));
10575 bool ReverseEXT = false;
10577 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
10580 Imm *= getExtFactor(V1);
10581 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
10582 DAG.getConstant(Imm, dl, MVT::i32));
10583 } else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
10584 Imm *= getExtFactor(V1);
10585 return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
10586 DAG.getConstant(Imm, dl, MVT::i32));
10589 unsigned WhichResult;
10590 if (isZIPMask(ShuffleMask, VT, WhichResult)) {
10591 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
10592 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
10594 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
10595 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
10596 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
10598 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
10599 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
10600 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
10603 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
10604 unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
10605 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
10607 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
10608 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
10609 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
10611 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
10612 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
10613 return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
10616 if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
10621 int NumInputElements = V1.getValueType().getVectorNumElements();
10622 if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
10623 SDValue DstVec = DstIsLeft ? V1 : V2;
10624 SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);
10626 SDValue SrcVec = V1;
10627 int SrcLane = ShuffleMask[Anomaly];
10628 if (SrcLane >= NumInputElements) {
10630 SrcLane -= VT.getVectorNumElements();
10632 SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);
10634 EVT ScalarVT = VT.getVectorElementType();
10636 if (ScalarVT.getFixedSizeInBits() < 32 && ScalarVT.isInteger())
10637 ScalarVT = MVT::i32;
10639 return DAG.getNode(
10640 ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
10641 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
10645 if (SDValue NewSD = tryWidenMaskForShuffle(Op, DAG))
10648 // If the shuffle is not directly supported and it has 4 elements, use
10649 // the PerfectShuffle-generated table to synthesize it from other shuffles.
10650 unsigned NumElts = VT.getVectorNumElements();
10651 if (NumElts == 4) {
10652 unsigned PFIndexes[4];
10653 for (unsigned i = 0; i != 4; ++i) {
10654 if (ShuffleMask[i] < 0)
10657 PFIndexes[i] = ShuffleMask[i];
10660 // Compute the index in the perfect shuffle table.
10661 unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
10662 PFIndexes[2] * 9 + PFIndexes[3];
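// Undef mask entries were encoded as 8 above, so each lane has nine possible
// values and the four lanes form a base-9 index into the perfect shuffle
// table; e.g. the mask <1, 1, u, 2> gives ((1 * 9 + 1) * 9 + 8) * 9 + 2.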
10663 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
10664 return GeneratePerfectShuffle(PFTableIndex, V1, V2, PFEntry, V1, V2, DAG,
10668 return GenerateTBL(Op, ShuffleMask, DAG);
10671 SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
10672 SelectionDAG &DAG) const {
10673 EVT VT = Op.getValueType();
10675 if (useSVEForFixedLengthVectorVT(VT))
10676 return LowerToScalableOp(Op, DAG);
10678 assert(VT.isScalableVector() && VT.getVectorElementType() == MVT::i1 &&
10679 "Unexpected vector type!");
10681 // We can handle the constant cases during isel.
10682 if (isa<ConstantSDNode>(Op.getOperand(0)))
10685 // There isn't a natural way to handle the general i1 case, so we use some
10686 // trickery with whilelo.
10688 SDValue SplatVal = DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, MVT::i64);
10689 SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i64, SplatVal,
10690 DAG.getValueType(MVT::i1));
10692 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
10693 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
10694 if (VT == MVT::nxv1i1)
10695 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::nxv1i1,
10696 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::nxv2i1, ID,
10699 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, ID, Zero, SplatVal);
10702 SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
10703 SelectionDAG &DAG) const {
10706 EVT VT = Op.getValueType();
10707 if (!isTypeLegal(VT) || !VT.isScalableVector())
10710 // Current lowering only supports the SVE-ACLE types.
10711 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
10714 // The DUPQ operation is independent of element type, so normalise to i64s.
10715 SDValue Idx128 = Op.getOperand(2);
10717 // DUPQ can be used when idx is in range.
10718 auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
10719 if (CIdx && (CIdx->getZExtValue() <= 3)) {
10720 SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
10721 return DAG.getNode(AArch64ISD::DUPLANE128, DL, VT, Op.getOperand(1), CI);
10724 SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
10726 // The ACLE says this must produce the same result as:
10727 // svtbl(data, svadd_x(svptrue_b64(),
10728 // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
// index * 2))
10730 SDValue One = DAG.getConstant(1, DL, MVT::i64);
10731 SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
10733 // create the vector 0,1,0,1,...
10734 SDValue SV = DAG.getStepVector(DL, MVT::nxv2i64);
10735 SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
10737 // create the vector idx64,idx64+1,idx64,idx64+1,...
10738 SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
10739 SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
10740 SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
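// For example, if the runtime index is 2 then Idx64 is 4 and ShuffleMask is
// <4, 5, 4, 5, ...>, so the TBL below copies 128-bit quadword 2 of the input
// into every quadword of the result.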
10742 // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
10743 SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
10744 return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
10748 static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
10749 APInt &UndefBits) {
10750 EVT VT = BVN->getValueType(0);
10751 APInt SplatBits, SplatUndef;
10752 unsigned SplatBitSize;
10754 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
10755 unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
10757 for (unsigned i = 0; i < NumSplats; ++i) {
10758 CnstBits <<= SplatBitSize;
10759 UndefBits <<= SplatBitSize;
10760 CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
10761 UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
10770 // Try 64-bit splatted SIMD immediate.
10771 static SDValue tryAdvSIMDModImm64(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
10772 const APInt &Bits) {
10773 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10774 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10775 EVT VT = Op.getValueType();
10776 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v2i64 : MVT::f64;
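// Modified-immediate type 10 is the 64-bit MOVI pattern in which every byte
// of the value is independently either 0x00 or 0xFF, e.g. 0x00FF00FF00FF00FF.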
10778 if (AArch64_AM::isAdvSIMDModImmType10(Value)) {
10779 Value = AArch64_AM::encodeAdvSIMDModImmType10(Value);
10782 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
10783 DAG.getConstant(Value, dl, MVT::i32));
10784 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10791 // Try 32-bit splatted SIMD immediate.
10792 static SDValue tryAdvSIMDModImm32(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
10794 const SDValue *LHS = nullptr) {
10795 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10796 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10797 EVT VT = Op.getValueType();
10798 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
10799 bool isAdvSIMDModImm = false;
10802 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType1(Value))) {
10803 Value = AArch64_AM::encodeAdvSIMDModImmType1(Value);
10806 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType2(Value))) {
10807 Value = AArch64_AM::encodeAdvSIMDModImmType2(Value);
10810 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType3(Value))) {
10811 Value = AArch64_AM::encodeAdvSIMDModImmType3(Value);
10814 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType4(Value))) {
10815 Value = AArch64_AM::encodeAdvSIMDModImmType4(Value);
10819 if (isAdvSIMDModImm) {
10824 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
10825 DAG.getConstant(Value, dl, MVT::i32),
10826 DAG.getConstant(Shift, dl, MVT::i32));
10828 Mov = DAG.getNode(NewOp, dl, MovTy,
10829 DAG.getConstant(Value, dl, MVT::i32),
10830 DAG.getConstant(Shift, dl, MVT::i32));
10832 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10839 // Try 16-bit splatted SIMD immediate.
10840 static SDValue tryAdvSIMDModImm16(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
10842 const SDValue *LHS = nullptr) {
10843 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10844 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10845 EVT VT = Op.getValueType();
10846 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
10847 bool isAdvSIMDModImm = false;
10850 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType5(Value))) {
10851 Value = AArch64_AM::encodeAdvSIMDModImmType5(Value);
10854 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType6(Value))) {
10855 Value = AArch64_AM::encodeAdvSIMDModImmType6(Value);
10859 if (isAdvSIMDModImm) {
10864 Mov = DAG.getNode(NewOp, dl, MovTy, *LHS,
10865 DAG.getConstant(Value, dl, MVT::i32),
10866 DAG.getConstant(Shift, dl, MVT::i32));
10868 Mov = DAG.getNode(NewOp, dl, MovTy,
10869 DAG.getConstant(Value, dl, MVT::i32),
10870 DAG.getConstant(Shift, dl, MVT::i32));
10872 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10879 // Try 32-bit splatted SIMD immediate with shifted ones.
10880 static SDValue tryAdvSIMDModImm321s(unsigned NewOp, SDValue Op,
10881 SelectionDAG &DAG, const APInt &Bits) {
10882 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10883 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10884 EVT VT = Op.getValueType();
10885 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
10886 bool isAdvSIMDModImm = false;
10889 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType7(Value))) {
10890 Value = AArch64_AM::encodeAdvSIMDModImmType7(Value);
10893 else if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType8(Value))) {
10894 Value = AArch64_AM::encodeAdvSIMDModImmType8(Value);
10898 if (isAdvSIMDModImm) {
10900 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
10901 DAG.getConstant(Value, dl, MVT::i32),
10902 DAG.getConstant(Shift, dl, MVT::i32));
10903 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10910 // Try 8-bit splatted SIMD immediate.
10911 static SDValue tryAdvSIMDModImm8(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
10912 const APInt &Bits) {
10913 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10914 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10915 EVT VT = Op.getValueType();
10916 MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
10918 if (AArch64_AM::isAdvSIMDModImmType9(Value)) {
10919 Value = AArch64_AM::encodeAdvSIMDModImmType9(Value);
10922 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
10923 DAG.getConstant(Value, dl, MVT::i32));
10924 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10931 // Try FP splatted SIMD immediate.
10932 static SDValue tryAdvSIMDModImmFP(unsigned NewOp, SDValue Op, SelectionDAG &DAG,
10933 const APInt &Bits) {
10934 if (Bits.getHiBits(64) == Bits.getLoBits(64)) {
10935 uint64_t Value = Bits.zextOrTrunc(64).getZExtValue();
10936 EVT VT = Op.getValueType();
10937 bool isWide = (VT.getSizeInBits() == 128);
10939 bool isAdvSIMDModImm = false;
10941 if ((isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType11(Value))) {
10942 Value = AArch64_AM::encodeAdvSIMDModImmType11(Value);
10943 MovTy = isWide ? MVT::v4f32 : MVT::v2f32;
10946 (isAdvSIMDModImm = AArch64_AM::isAdvSIMDModImmType12(Value))) {
10947 Value = AArch64_AM::encodeAdvSIMDModImmType12(Value);
10948 MovTy = MVT::v2f64;
10951 if (isAdvSIMDModImm) {
10953 SDValue Mov = DAG.getNode(NewOp, dl, MovTy,
10954 DAG.getConstant(Value, dl, MVT::i32));
10955 return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
10962 // Specialized code to quickly find if PotentialBVec is a BuildVector that
10963 // consists of only the same constant int value, returned in reference arg ConstVal.
10965 static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
10966 uint64_t &ConstVal) {
10967 BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
10970 ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
10973 EVT VT = Bvec->getValueType(0);
10974 unsigned NumElts = VT.getVectorNumElements();
10975 for (unsigned i = 1; i < NumElts; ++i)
10976 if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
10978 ConstVal = FirstElt->getZExtValue();
10982 // Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
10983 // to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
10984 // BUILD_VECTORs with constant element C1, C2 is a constant, and:
10985 // - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
10986 // - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
10987 // The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
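// For example, for v4i32 and C2 == 24:
//   (or (and X, splat(0x00FFFFFF)), (vshl Y, #24)) --> (VSLI X, Y, #24)
// since 0x00FFFFFF == ~(0xFFFFFFFF << 24).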
10988 static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
10989 EVT VT = N->getValueType(0);
10991 if (!VT.isVector())
10999 SDValue FirstOp = N->getOperand(0);
11000 unsigned FirstOpc = FirstOp.getOpcode();
11001 SDValue SecondOp = N->getOperand(1);
11002 unsigned SecondOpc = SecondOp.getOpcode();
11004 // Is one of the operands an AND or a BICi? The AND may have been optimised to
11005 // a BICi in order to use an immediate instead of a register.
11006 // Is the other operand a shl or lshr? This will have been turned into:
11007 // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
11008 if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
11009 (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
11013 } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
11014 (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
11020 bool IsAnd = And.getOpcode() == ISD::AND;
11021 bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
11023 // Is the shift amount constant?
11024 ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
11030 // Is the and mask vector all constant?
11031 if (!isAllConstantBuildVector(And.getOperand(1), C1))
11034 // Reconstruct the corresponding AND immediate from the two BICi immediates.
11035 ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
11036 ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
11037 assert(C1nodeImm && C1nodeShift);
11038 C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
11041 // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
11042 // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
11043 // how much one can shift elements of a particular size?
11044 uint64_t C2 = C2node->getZExtValue();
11045 unsigned ElemSizeInBits = VT.getScalarSizeInBits();
11046 if (C2 > ElemSizeInBits)
11049 APInt C1AsAPInt(ElemSizeInBits, C1);
11050 APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
11051 : APInt::getLowBitsSet(ElemSizeInBits, C2);
11052 if (C1AsAPInt != RequiredC1)
11055 SDValue X = And.getOperand(0);
11056 SDValue Y = Shift.getOperand(0);
11058 unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
11059 SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
11061 LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
11062 LLVM_DEBUG(N->dump(&DAG));
11063 LLVM_DEBUG(dbgs() << "into: \n");
11064 LLVM_DEBUG(ResultSLI->dump(&DAG));
11070 SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
11071 SelectionDAG &DAG) const {
11072 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
11073 return LowerToScalableOp(Op, DAG);
11075 // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
11076 if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
11079 EVT VT = Op.getValueType();
11081 SDValue LHS = Op.getOperand(0);
11082 BuildVectorSDNode *BVN =
11083 dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
11085 // OR commutes, so try swapping the operands.
11086 LHS = Op.getOperand(1);
11087 BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
11092 APInt DefBits(VT.getSizeInBits(), 0);
11093 APInt UndefBits(VT.getSizeInBits(), 0);
11094 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11097 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11099 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11103 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::ORRi, Op, DAG,
11104 UndefBits, &LHS)) ||
11105 (NewOp = tryAdvSIMDModImm16(AArch64ISD::ORRi, Op, DAG,
11110 // We can always fall back to a non-immediate OR.
11114 // Normalize the operands of BUILD_VECTOR. The value of constant operands will
11115 // be truncated to fit the element width.
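// For example, an i32 constant operand of 0x1ff in a v8i8 build vector is
// rewritten as the i32 constant 0xff, keeping only the low 8 bits.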
11116 static SDValue NormalizeBuildVector(SDValue Op,
11117 SelectionDAG &DAG) {
11118 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
11120 EVT VT = Op.getValueType();
11121 EVT EltTy = VT.getVectorElementType();
11123 if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16)
11126 SmallVector<SDValue, 16> Ops;
11127 for (SDValue Lane : Op->ops()) {
11128 // For integer vectors, type legalization would have promoted the
11129 // operands already. Otherwise, if Op is a floating-point splat
11130 // (with operands cast to integers), then the only possibilities
11131 // are constants and UNDEFs.
11132 if (auto *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
11133 APInt LowBits(EltTy.getSizeInBits(),
11134 CstLane->getZExtValue());
11135 Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32);
11136 } else if (Lane.getNode()->isUndef()) {
11137 Lane = DAG.getUNDEF(MVT::i32);
11139 assert(Lane.getValueType() == MVT::i32 &&
11140 "Unexpected BUILD_VECTOR operand type");
11142 Ops.push_back(Lane);
11144 return DAG.getBuildVector(VT, dl, Ops);
11147 static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
11148 EVT VT = Op.getValueType();
11150 APInt DefBits(VT.getSizeInBits(), 0);
11151 APInt UndefBits(VT.getSizeInBits(), 0);
11152 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11153 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
11155 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11156 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11157 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11158 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11159 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11160 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11163 DefBits = ~DefBits;
11164 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11165 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11166 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11169 DefBits = UndefBits;
11170 if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
11171 (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11172 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
11173 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
11174 (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
11175 (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
11178 DefBits = ~UndefBits;
11179 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
11180 (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
11181 (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
11188 SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
11189 SelectionDAG &DAG) const {
11190 EVT VT = Op.getValueType();
11192 if (useSVEForFixedLengthVectorVT(VT)) {
11193 if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
11195 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
11196 SDValue Start = DAG.getConstant(SeqInfo->first, DL, ContainerVT);
11197 SDValue Steps = DAG.getStepVector(DL, ContainerVT, SeqInfo->second);
11198 SDValue Seq = DAG.getNode(ISD::ADD, DL, ContainerVT, Start, Steps);
11199 return convertFromScalableVector(DAG, Op.getValueType(), Seq);
11202 // Revert to common legalisation for all other variants.
11206 // Try to build a simple constant vector.
11207 Op = NormalizeBuildVector(Op, DAG);
11208 if (VT.isInteger()) {
11209 // Certain vector constants, used to express things like logical NOT and
11210 // arithmetic NEG, are passed through unmodified. This allows special
11211 // patterns for these operations to match, which will lower these constants
11212 // to whatever is proven necessary.
11213 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
11214 if (BVN->isConstant())
11215 if (ConstantSDNode *Const = BVN->getConstantSplatNode()) {
11216 unsigned BitSize = VT.getVectorElementType().getSizeInBits();
11218 Const->getAPIntValue().zextOrTrunc(BitSize).getZExtValue());
11219 if (Val.isZero() || Val.isAllOnes())
11224 if (SDValue V = ConstantBuildVector(Op, DAG))
11227 // Scan through the operands to find some interesting properties we can exploit:
11229 // 1) If only one value is used, we can use a DUP, or
11230 // 2) if only the low element is not undef, we can just insert that, or
11231 // 3) if only one constant value is used (w/ some non-constant lanes),
11232 // we can splat the constant value into the whole vector then fill
11233 // in the non-constant lanes.
11234 // 4) FIXME: If different constant values are used, but we can intelligently
11235 // select the values we'll be overwriting for the non-constant
11236 // lanes such that we can directly materialize the vector
11237 // some other way (MOVI, e.g.), we can be sneaky.
11238 // 5) if all operands are EXTRACT_VECTOR_ELT, check for VUZP.
11240 unsigned NumElts = VT.getVectorNumElements();
11241 bool isOnlyLowElement = true;
11242 bool usesOnlyOneValue = true;
11243 bool usesOnlyOneConstantValue = true;
11244 bool isConstant = true;
11245 bool AllLanesExtractElt = true;
11246 unsigned NumConstantLanes = 0;
11247 unsigned NumDifferentLanes = 0;
11248 unsigned NumUndefLanes = 0;
11250 SDValue ConstantValue;
11251 for (unsigned i = 0; i < NumElts; ++i) {
11252 SDValue V = Op.getOperand(i);
11253 if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
11254 AllLanesExtractElt = false;
11260 isOnlyLowElement = false;
11261 if (!isIntOrFPConstant(V))
11262 isConstant = false;
11264 if (isIntOrFPConstant(V)) {
11265 ++NumConstantLanes;
11266 if (!ConstantValue.getNode())
11268 else if (ConstantValue != V)
11269 usesOnlyOneConstantValue = false;
11272 if (!Value.getNode())
11274 else if (V != Value) {
11275 usesOnlyOneValue = false;
11276 ++NumDifferentLanes;
11280 if (!Value.getNode()) {
11282 dbgs() << "LowerBUILD_VECTOR: value undefined, creating undef node\n");
11283 return DAG.getUNDEF(VT);
11286 // Convert BUILD_VECTOR where all elements but the lowest are undef into
11287 // SCALAR_TO_VECTOR, except for when we have a single-element constant vector
11288 // as SimplifyDemandedBits will just turn that back into BUILD_VECTOR.
11289 if (isOnlyLowElement && !(NumElts == 1 && isIntOrFPConstant(Value))) {
11290 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: only low element used, creating 1 "
11291 "SCALAR_TO_VECTOR node\n");
11292 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
11295 if (AllLanesExtractElt) {
11296 SDNode *Vector = nullptr;
11299 // Check whether the extract elements match the Even pattern <0,2,4,...> or
11300 // the Odd pattern <1,3,5,...>.
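// For example, for a v4i16 result whose lanes all come from a v8i16 vector V:
//   <V[0], V[2], V[4], V[6]>  ==>  UZP1(V[0..3], V[4..7])
//   <V[1], V[3], V[5], V[7]>  ==>  UZP2(V[0..3], V[4..7])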
11301 for (unsigned i = 0; i < NumElts; ++i) {
11302 SDValue V = Op.getOperand(i);
11303 const SDNode *N = V.getNode();
11304 if (!isa<ConstantSDNode>(N->getOperand(1)))
11306 SDValue N0 = N->getOperand(0);
11308 // All elements are extracted from the same vector.
11310 Vector = N0.getNode();
11311 // Check that the type of EXTRACT_VECTOR_ELT matches the type of
11313 if (VT.getVectorElementType() !=
11314 N0.getValueType().getVectorElementType())
11316 } else if (Vector != N0.getNode()) {
11322 // Extracted values are either at Even indices <0,2,4,...> or at Odd
11323 // indices <1,3,5,...>.
11324 uint64_t Val = N->getConstantOperandVal(1);
11325 if (Val == 2 * i) {
11329 if (Val - 1 == 2 * i) {
11334 // Something does not match: abort.
11341 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11342 DAG.getConstant(0, dl, MVT::i64));
11344 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SDValue(Vector, 0),
11345 DAG.getConstant(NumElts, dl, MVT::i64));
11348 return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), LHS,
11351 return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), LHS,
11356 // Use DUP for non-constant splats. For f32 constant splats, reduce to
11357 // i32 and try again.
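// For example, (build_vector X, X, X, X) with a non-constant X becomes a
// single (AArch64ISD::DUP X).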
11358 if (usesOnlyOneValue) {
11360 if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
11361 Value.getValueType() != VT) {
11363 dbgs() << "LowerBUILD_VECTOR: use DUP for non-constant splats\n");
11364 return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
11367 // This is actually a DUPLANExx operation, which keeps everything vectory.
11369 SDValue Lane = Value.getOperand(1);
11370 Value = Value.getOperand(0);
11371 if (Value.getValueSizeInBits() == 64) {
11373 dbgs() << "LowerBUILD_VECTOR: DUPLANE works on 128-bit vectors, "
11375 Value = WidenVector(Value, DAG);
11378 unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
11379 return DAG.getNode(Opcode, dl, VT, Value, Lane);
11382 if (VT.getVectorElementType().isFloatingPoint()) {
11383 SmallVector<SDValue, 8> Ops;
11384 EVT EltTy = VT.getVectorElementType();
11385 assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
11386 EltTy == MVT::f64) && "Unsupported floating-point vector type");
11388 dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
11389 "BITCASTS, and try again\n");
11390 MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
11391 for (unsigned i = 0; i < NumElts; ++i)
11392 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
11393 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
11394 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
11395 LLVM_DEBUG(dbgs() << "LowerBUILD_VECTOR: trying to lower new vector: ";
11397 Val = LowerBUILD_VECTOR(Val, DAG);
11399 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
11403 // If we need to insert a small number of different non-constant elements and
11404 // the vector width is sufficiently large, prefer using DUP with the common
11405 // value and INSERT_VECTOR_ELT for the different lanes. If DUP is preferred,
11406 // skip the constant lane handling below.
11407 bool PreferDUPAndInsert =
11408 !isConstant && NumDifferentLanes >= 1 &&
11409 NumDifferentLanes < ((NumElts - NumUndefLanes) / 2) &&
11410 NumDifferentLanes >= NumConstantLanes;
11412 // If only one constant value was used, and it appeared in more than one
11413 // lane, start by splatting that value, then replace the non-constant lanes.
11414 // This is better than the default, which initializes each lane separately.
11416 if (!PreferDUPAndInsert && NumConstantLanes > 0 && usesOnlyOneConstantValue) {
11417 // Firstly, try to materialize the splat constant.
11418 SDValue Vec = DAG.getSplatBuildVector(VT, dl, ConstantValue),
11419 Val = ConstantBuildVector(Vec, DAG);
11421 // Otherwise, materialize the constant and splat it.
11422 Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
11423 DAG.ReplaceAllUsesWith(Vec.getNode(), &Val);
11426 // Now insert the non-constant lanes.
11427 for (unsigned i = 0; i < NumElts; ++i) {
11428 SDValue V = Op.getOperand(i);
11429 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
11430 if (!isIntOrFPConstant(V))
11431 // Note that type legalization likely mucked about with the VT of the
11432 // source operand, so we may have to convert it here before inserting.
11433 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
11438 // This will generate a load from the constant pool.
11441 dbgs() << "LowerBUILD_VECTOR: all elements are constant, use default "
11446 // Detect patterns of a0,a1,a2,a3,b0,b1,b2,b3,c0,c1,c2,c3,d0,d1,d2,d3 from
11447 // v4i32s. This is really a truncate, which we can construct out of (legal)
11448 // concats and truncate nodes.
11449 if (SDValue M = ReconstructTruncateFromBuildVector(Op, DAG))
11452 // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
11453 if (NumElts >= 4) {
11454 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
11458 if (PreferDUPAndInsert) {
11459 // First, build a constant vector with the common element.
11460 SmallVector<SDValue, 8> Ops(NumElts, Value);
11461 SDValue NewVector = LowerBUILD_VECTOR(DAG.getBuildVector(VT, dl, Ops), DAG);
11462 // Next, insert the elements that do not match the common value.
11463 for (unsigned I = 0; I < NumElts; ++I)
11464 if (Op.getOperand(I) != Value)
11466 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, NewVector,
11467 Op.getOperand(I), DAG.getConstant(I, dl, MVT::i64));
11472 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
11473 // know the default expansion would otherwise fall back on something even
11474 // worse. For a vector with one or two non-undef values, that's
11475 // scalar_to_vector for the elements followed by a shuffle (provided the
11476 // shuffle is valid for the target) and materialization element by element
11477 // on the stack followed by a load for everything else.
11478 if (!isConstant && !usesOnlyOneValue) {
11480 dbgs() << "LowerBUILD_VECTOR: alternatives failed, creating sequence "
11481 "of INSERT_VECTOR_ELT\n");
11483 SDValue Vec = DAG.getUNDEF(VT);
11484 SDValue Op0 = Op.getOperand(0);
11487 // Use SCALAR_TO_VECTOR for lane zero to
11488 // a) Avoid a RMW dependency on the full vector register, and
11489 // b) Allow the register coalescer to fold away the copy if the
11490 // value is already in an S or D register, and we're forced to emit an
11491 // INSERT_SUBREG that we can't fold anywhere.
11493 // We also allow types like i8 and i16 which are illegal scalar but legal
11494 // vector element types. After type-legalization the inserted value is
11495 // extended (i32) and it is safe to cast them to the vector type by ignoring
11496 // the upper bits of the lowest lane (e.g. v8i8, v4i16).
11497 if (!Op0.isUndef()) {
11498 LLVM_DEBUG(dbgs() << "Creating node for op0, it is not undefined:\n");
11499 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
11502 LLVM_DEBUG(if (i < NumElts) dbgs()
11503 << "Creating nodes for the other vector elements:\n";);
11504 for (; i < NumElts; ++i) {
11505 SDValue V = Op.getOperand(i);
11508 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
11509 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
11515 dbgs() << "LowerBUILD_VECTOR: use default expansion, failed to find "
11516 "better alternative\n");
11520 SDValue AArch64TargetLowering::LowerCONCAT_VECTORS(SDValue Op,
11521 SelectionDAG &DAG) const {
11522 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
11523 return LowerFixedLengthConcatVectorsToSVE(Op, DAG);
11525 assert(Op.getValueType().isScalableVector() &&
11526 isTypeLegal(Op.getValueType()) &&
11527 "Expected legal scalable vector type!");
11529 if (isTypeLegal(Op.getOperand(0).getValueType())) {
11530 unsigned NumOperands = Op->getNumOperands();
11531 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
11532 "Unexpected number of operands in CONCAT_VECTORS");
11534 if (NumOperands == 2)
11537 // Concat each pair of subvectors and pack into the lower half of the array.
11538 SmallVector<SDValue> ConcatOps(Op->op_begin(), Op->op_end());
11539 while (ConcatOps.size() > 1) {
11540 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
11541 SDValue V1 = ConcatOps[I];
11542 SDValue V2 = ConcatOps[I + 1];
11543 EVT SubVT = V1.getValueType();
11544 EVT PairVT = SubVT.getDoubleNumVectorElementsVT(*DAG.getContext());
11546 DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), PairVT, V1, V2);
11548 ConcatOps.resize(ConcatOps.size() / 2);
11550 return ConcatOps[0];
11556 SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
11557 SelectionDAG &DAG) const {
11558 assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
11560 if (useSVEForFixedLengthVectorVT(Op.getValueType()))
11561 return LowerFixedLengthInsertVectorElt(Op, DAG);
11563 // Check for non-constant or out of range lane.
11564 EVT VT = Op.getOperand(0).getValueType();
11566 if (VT.getScalarType() == MVT::i1) {
11567 EVT VectorVT = getPromotedVTForPredicate(VT);
11569 SDValue ExtendedVector =
11570 DAG.getAnyExtOrTrunc(Op.getOperand(0), DL, VectorVT);
11571 SDValue ExtendedValue =
11572 DAG.getAnyExtOrTrunc(Op.getOperand(1), DL,
11573 VectorVT.getScalarType().getSizeInBits() < 32
11575 : VectorVT.getScalarType());
11577 DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VectorVT, ExtendedVector,
11578 ExtendedValue, Op.getOperand(2));
11579 return DAG.getAnyExtOrTrunc(ExtendedVector, DL, VT);
11582 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
11583 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
11586 // Insertion/extraction are legal for V128 types.
11587 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
11588 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
11589 VT == MVT::v8f16 || VT == MVT::v8bf16)
11592 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
11593 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
11597 // For V64 types, we perform insertion by expanding the value
11598 // to a V128 type and performing the insertion on that.
11600 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
11601 EVT WideTy = WideVec.getValueType();
11603 SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
11604 Op.getOperand(1), Op.getOperand(2));
11605 // Re-narrow the resultant vector.
11606 return NarrowVector(Node, DAG);
11610 AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
11611 SelectionDAG &DAG) const {
11612 assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
11613 EVT VT = Op.getOperand(0).getValueType();
11615 if (VT.getScalarType() == MVT::i1) {
11616 // We can't directly extract from an SVE predicate; extend it first.
11617 // (This isn't the only possible lowering, but it's straightforward.)
11618 EVT VectorVT = getPromotedVTForPredicate(VT);
11621 DAG.getNode(ISD::ANY_EXTEND, DL, VectorVT, Op.getOperand(0));
11622 MVT ExtractTy = VectorVT == MVT::nxv2i64 ? MVT::i64 : MVT::i32;
11623 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractTy,
11624 Extend, Op.getOperand(1));
11625 return DAG.getAnyExtOrTrunc(Extract, DL, Op.getValueType());
11628 if (useSVEForFixedLengthVectorVT(VT))
11629 return LowerFixedLengthExtractVectorElt(Op, DAG);
11631 // Check for non-constant or out of range lane.
11632 ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
11633 if (!CI || CI->getZExtValue() >= VT.getVectorNumElements())
11636 // Insertion/extraction are legal for V128 types.
11637 if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
11638 VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
11639 VT == MVT::v8f16 || VT == MVT::v8bf16)
11642 if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
11643 VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
11647 // For V64 types, we perform extraction by expanding the value
11648 // to a V128 type and performing the extraction on that.
11650 SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
11651 EVT WideTy = WideVec.getValueType();
11653 EVT ExtrTy = WideTy.getVectorElementType();
11654 if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
11657 // For extractions, we just return the result directly.
11658 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
11662 SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
11663 SelectionDAG &DAG) const {
11664 assert(Op.getValueType().isFixedLengthVector() &&
11665 "Only cases that extract a fixed length vector are supported!");
11667 EVT InVT = Op.getOperand(0).getValueType();
11668 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
11669 unsigned Size = Op.getValueSizeInBits();
11671 // If we don't have legal types yet, do nothing
11672 if (!DAG.getTargetLoweringInfo().isTypeLegal(InVT))
11675 if (InVT.isScalableVector()) {
11676 // This will be matched by custom code during ISelDAGToDAG.
11677 if (Idx == 0 && isPackedVectorType(InVT, DAG))
11683 // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
11684 if (Idx == 0 && InVT.getSizeInBits() <= 128)
11687 // If this is extracting the upper 64-bits of a 128-bit vector, we match that directly.
11689 if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64 &&
11690 InVT.getSizeInBits() == 128)
11693 if (useSVEForFixedLengthVectorVT(InVT)) {
11696 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
11698 convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
11700 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ContainerVT, NewInVec,
11701 NewInVec, DAG.getConstant(Idx, DL, MVT::i64));
11702 return convertFromScalableVector(DAG, Op.getValueType(), Splice);
11708 SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
11709 SelectionDAG &DAG) const {
11710 assert(Op.getValueType().isScalableVector() &&
11711 "Only expect to lower inserts into scalable vectors!");
11713 EVT InVT = Op.getOperand(1).getValueType();
11714 unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
11716 SDValue Vec0 = Op.getOperand(0);
11717 SDValue Vec1 = Op.getOperand(1);
11719 EVT VT = Op.getValueType();
11721 if (InVT.isScalableVector()) {
11722 if (!isTypeLegal(VT))
11725 // Break down insert_subvector into simpler parts.
11726 if (VT.getVectorElementType() == MVT::i1) {
11727 unsigned NumElts = VT.getVectorMinNumElements();
11728 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
11731 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
11732 DAG.getVectorIdxConstant(0, DL));
11733 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Vec0,
11734 DAG.getVectorIdxConstant(NumElts / 2, DL));
11735 if (Idx < (NumElts / 2)) {
11736 SDValue NewLo = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Lo, Vec1,
11737 DAG.getVectorIdxConstant(Idx, DL));
11738 return DAG.getNode(AArch64ISD::UZP1, DL, VT, NewLo, Hi);
11741 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, HalfVT, Hi, Vec1,
11742 DAG.getVectorIdxConstant(Idx - (NumElts / 2), DL));
11743 return DAG.getNode(AArch64ISD::UZP1, DL, VT, Lo, NewHi);
11747 // Ensure the subvector is half the size of the main vector.
11748 if (VT.getVectorElementCount() != (InVT.getVectorElementCount() * 2))
11751 // Here narrow and wide refer to the vector element types. After "casting",
11752 // both vectors must have the same bit length, and so, because the subvector
11753 // has fewer elements, those elements need to be bigger.
11754 EVT NarrowVT = getPackedSVEVectorVT(VT.getVectorElementCount());
11755 EVT WideVT = getPackedSVEVectorVT(InVT.getVectorElementCount());
11757 // NOP cast operands to the largest legal vector of the same element count.
11758 if (VT.isFloatingPoint()) {
11759 Vec0 = getSVESafeBitCast(NarrowVT, Vec0, DAG);
11760 Vec1 = getSVESafeBitCast(WideVT, Vec1, DAG);
11762 // Legal integer vectors are already their largest so Vec0 is fine as is.
11763 Vec1 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Vec1);
11766 // To replace the top/bottom half of vector V with vector SubV we widen the
11767 // preserved half of V, concatenate this to SubV (the order depending on the
11768 // half being replaced) and then narrow the result.
11771 SDValue HiVec0 = DAG.getNode(AArch64ISD::UUNPKHI, DL, WideVT, Vec0);
11772 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, Vec1, HiVec0);
11774 assert(Idx == InVT.getVectorMinNumElements() &&
11775 "Invalid subvector index!");
11776 SDValue LoVec0 = DAG.getNode(AArch64ISD::UUNPKLO, DL, WideVT, Vec0);
11777 Narrow = DAG.getNode(AArch64ISD::UZP1, DL, NarrowVT, LoVec0, Vec1);
11780 return getSVESafeBitCast(VT, Narrow, DAG);
11783 if (Idx == 0 && isPackedVectorType(VT, DAG)) {
11784 // This will be matched by custom code during ISelDAGToDAG.
11785 if (Vec0.isUndef())
11788 Optional<unsigned> PredPattern =
11789 getSVEPredPatternFromNumElements(InVT.getVectorNumElements());
11790 auto PredTy = VT.changeVectorElementType(MVT::i1);
11791 SDValue PTrue = getPTrue(DAG, DL, PredTy, *PredPattern);
11792 SDValue ScalableVec1 = convertToScalableVector(DAG, VT, Vec1);
11793 return DAG.getNode(ISD::VSELECT, DL, VT, PTrue, ScalableVec1, Vec0);
11799 static bool isPow2Splat(SDValue Op, uint64_t &SplatVal, bool &Negated) {
11800 if (Op.getOpcode() != AArch64ISD::DUP &&
11801 Op.getOpcode() != ISD::SPLAT_VECTOR &&
11802 Op.getOpcode() != ISD::BUILD_VECTOR)
11805 if (Op.getOpcode() == ISD::BUILD_VECTOR &&
11806 !isAllConstantBuildVector(Op, SplatVal))
11809 if (Op.getOpcode() != ISD::BUILD_VECTOR &&
11810 !isa<ConstantSDNode>(Op->getOperand(0)))
11813 SplatVal = Op->getConstantOperandVal(0);
11814 if (Op.getValueType().getVectorElementType() != MVT::i64)
11815 SplatVal = (int32_t)SplatVal;
11818 if (isPowerOf2_64(SplatVal))
11822 if (isPowerOf2_64(-SplatVal)) {
11823 SplatVal = -SplatVal;
11830 SDValue AArch64TargetLowering::LowerDIV(SDValue Op, SelectionDAG &DAG) const {
11831 EVT VT = Op.getValueType();
11834 if (useSVEForFixedLengthVectorVT(VT, /*OverrideNEON=*/true))
11835 return LowerFixedLengthVectorIntDivideToSVE(Op, DAG);
11837 assert(VT.isScalableVector() && "Expected a scalable vector.");
11839 bool Signed = Op.getOpcode() == ISD::SDIV;
11840 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
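// A signed divide by a power of two can use the SVE ASRD instruction (a
// rounding arithmetic shift right); for a negative power of two the shifted
// result is additionally negated.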
11844 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
11845 SDValue Pg = getPredicateForScalableVector(DAG, dl, VT);
11847 DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, VT, Pg, Op->getOperand(0),
11848 DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32));
11850 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
11855 if (VT == MVT::nxv4i32 || VT == MVT::nxv2i64)
11856 return LowerToPredicatedOp(Op, DAG, PredOpcode);
11858 // SVE doesn't have i8 and i16 DIV operations; widen them to 32-bit
11859 // operations, and truncate the result.
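// For example, an nxv16i8 divide unpacks both operands into nxv8i16 halves,
// divides each half (which in turn widens again to nxv4i32), and re-packs the
// two partial results with UZP1.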
11861 if (VT == MVT::nxv16i8)
11862 WidenedVT = MVT::nxv8i16;
11863 else if (VT == MVT::nxv8i16)
11864 WidenedVT = MVT::nxv4i32;
11866 llvm_unreachable("Unexpected Custom DIV operation");
11868 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
11869 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
11870 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(0));
11871 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, WidenedVT, Op.getOperand(1));
11872 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(0));
11873 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, WidenedVT, Op.getOperand(1));
11874 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Lo, Op1Lo);
11875 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0Hi, Op1Hi);
11876 return DAG.getNode(AArch64ISD::UZP1, dl, VT, ResultLo, ResultHi);
11879 bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
11880 // Currently no fixed length shuffles that require SVE are legal.
11881 if (useSVEForFixedLengthVectorVT(VT))
11884 if (VT.getVectorNumElements() == 4 &&
11885 (VT.is128BitVector() || VT.is64BitVector())) {
11886 unsigned Cost = getPerfectShuffleCost(M);
11893 unsigned DummyUnsigned;
11895 return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
11896 isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
11897 isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
11898 // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
11899 isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
11900 isZIPMask(M, VT, DummyUnsigned) ||
11901 isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
11902 isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
11903 isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
11904 isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
11905 isConcatMask(M, VT, VT.getSizeInBits() == 128));
11908 bool AArch64TargetLowering::isVectorClearMaskLegal(ArrayRef<int> M,
11910 // Just delegate to the generic legality, clear masks aren't special.
11911 return isShuffleMaskLegal(M, VT);
11914 /// getVShiftImm - Check if this is a valid build_vector for the immediate
11915 /// operand of a vector shift operation, where all the elements of the
11916 /// build_vector must have the same constant integer value.
11917 static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
11918 // Ignore bit_converts.
11919 while (Op.getOpcode() == ISD::BITCAST)
11920 Op = Op.getOperand(0);
11921 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
11922 APInt SplatBits, SplatUndef;
11923 unsigned SplatBitSize;
11925 if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
11926 HasAnyUndefs, ElementBits) ||
11927 SplatBitSize > ElementBits)
11929 Cnt = SplatBits.getSExtValue();
11933 /// isVShiftLImm - Check if this is a valid build_vector for the immediate
11934 /// operand of a vector shift left operation. That value must be in the range:
11935 /// 0 <= Value < ElementBits for a left shift; or
11936 /// 0 <= Value <= ElementBits for a long left shift.
11937 static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
11938 assert(VT.isVector() && "vector shift count is not a vector type");
11939 int64_t ElementBits = VT.getScalarSizeInBits();
11940 if (!getVShiftImm(Op, ElementBits, Cnt))
11942 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
11945 /// isVShiftRImm - Check if this is a valid build_vector for the immediate
11946 /// operand of a vector shift right operation. The value must be in the range:
11947 /// 1 <= Value <= ElementBits for a right shift; or
///      1 <= Value <= ElementBits/2 for a narrowing right shift.
11948 static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
11949 assert(VT.isVector() && "vector shift count is not a vector type");
11950 int64_t ElementBits = VT.getScalarSizeInBits();
11951 if (!getVShiftImm(Op, ElementBits, Cnt))
11953 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
11956 SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
11957 SelectionDAG &DAG) const {
11958 EVT VT = Op.getValueType();
11960 if (VT.getScalarType() == MVT::i1) {
11961 // Lower i1 truncate to `(x & 1) != 0`.
11963 EVT OpVT = Op.getOperand(0).getValueType();
11964 SDValue Zero = DAG.getConstant(0, dl, OpVT);
11965 SDValue One = DAG.getConstant(1, dl, OpVT);
11966 SDValue And = DAG.getNode(ISD::AND, dl, OpVT, Op.getOperand(0), One);
11967 return DAG.getSetCC(dl, VT, And, Zero, ISD::SETNE);
11970 if (!VT.isVector() || VT.isScalableVector())
11973 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
11974 return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
11979 SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
11980 SelectionDAG &DAG) const {
11981 EVT VT = Op.getValueType();
11985 if (!Op.getOperand(1).getValueType().isVector())
11987 unsigned EltSize = VT.getScalarSizeInBits();
11989 switch (Op.getOpcode()) {
11991 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT))
11992 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_PRED);
11994 if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
11995 return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
11996 DAG.getConstant(Cnt, DL, MVT::i32));
11997 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
11998 DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
12000 Op.getOperand(0), Op.getOperand(1));
12003 if (VT.isScalableVector() || useSVEForFixedLengthVectorVT(VT)) {
12004 unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_PRED
12005 : AArch64ISD::SRL_PRED;
12006 return LowerToPredicatedOp(Op, DAG, Opc);
12009 // Right shift immediate
12010 if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
12012 (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
12013 return DAG.getNode(Opc, DL, VT, Op.getOperand(0),
12014 DAG.getConstant(Cnt, DL, MVT::i32));
12017 // Right shift by register. Note that there is no shift-right-by-register
12018 // instruction, but the shift-left-by-register instruction takes a signed
12019 // value, where negative amounts specify a right shift.
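// For example, (srl X, Y) is emitted as aarch64.neon.ushl(X, (sub 0, Y)) and
// (sra X, Y) as aarch64.neon.sshl(X, (sub 0, Y)).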
12020 unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
12021 : Intrinsic::aarch64_neon_ushl;
12022 // Negate the shift amount.
12023 SDValue NegShift = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
12025 SDValue NegShiftLeft =
12026 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
12027 DAG.getConstant(Opc, DL, MVT::i32), Op.getOperand(0),
12029 return NegShiftLeft;
12032 llvm_unreachable("unexpected shift opcode");
12035 static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
12036 AArch64CC::CondCode CC, bool NoNans, EVT VT,
12037 const SDLoc &dl, SelectionDAG &DAG) {
12038 EVT SrcVT = LHS.getValueType();
12039 assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
12040 "function only supposed to emit natural comparisons");
12042 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
12043 APInt CnstBits(VT.getSizeInBits(), 0);
12044 APInt UndefBits(VT.getSizeInBits(), 0);
12045 bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
12046 bool IsZero = IsCnst && (CnstBits == 0);
12048 if (SrcVT.getVectorElementType().isFloatingPoint()) {
12052 case AArch64CC::NE: {
12055 Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12057 Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12058 return DAG.getNOT(dl, Fcmeq, VT);
12060 case AArch64CC::EQ:
12062 return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
12063 return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
12064 case AArch64CC::GE:
12066 return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
12067 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
12068 case AArch64CC::GT:
12070 return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
12071 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
12072 case AArch64CC::LE:
12075 // If we ignore NaNs then we can use the LS implementation.
12077 case AArch64CC::LS:
12079 return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
12080 return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
12081 case AArch64CC::LT:
12084 // If we ignore NaNs then we can use the MI implementation.
12086 case AArch64CC::MI:
12088 return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
12089 return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
12096 case AArch64CC::NE: {
12099 Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12101 Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12102 return DAG.getNOT(dl, Cmeq, VT);
12104 case AArch64CC::EQ:
12106 return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
12107 return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
12108 case AArch64CC::GE:
12110 return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
12111 return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
12112 case AArch64CC::GT:
12114 return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
12115 return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
12116 case AArch64CC::LE:
12118 return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
12119 return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
12120 case AArch64CC::LS:
12121 return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
12122 case AArch64CC::LO:
12123 return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
12124 case AArch64CC::LT:
12126 return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
12127 return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
12128 case AArch64CC::HI:
12129 return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
12130 case AArch64CC::HS:
12131 return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
12135 SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
12136 SelectionDAG &DAG) const {
12137 if (Op.getValueType().isScalableVector())
12138 return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
12140 if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12141 return LowerFixedLengthVectorSetccToSVE(Op, DAG);
12143 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
12144 SDValue LHS = Op.getOperand(0);
12145 SDValue RHS = Op.getOperand(1);
12146 EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
12149 if (LHS.getValueType().getVectorElementType().isInteger()) {
12150 assert(LHS.getValueType() == RHS.getValueType());
12151 AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
12153 EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
12154 return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12157 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
12159 // Make v4f16 (only) fcmp operations utilise vector instructions.
12160 // v8f16 support will be a little more complicated.
12161 if (!FullFP16 && LHS.getValueType().getVectorElementType() == MVT::f16) {
12162 if (LHS.getValueType().getVectorNumElements() == 4) {
12163 LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, LHS);
12164 RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, RHS);
12165 SDValue NewSetcc = DAG.getSetCC(dl, MVT::v4i16, LHS, RHS, CC);
12166 DAG.ReplaceAllUsesWith(Op, NewSetcc);
12167 CmpVT = MVT::v4i32;
12172 assert((!FullFP16 && LHS.getValueType().getVectorElementType() != MVT::f16) ||
12173 LHS.getValueType().getVectorElementType() != MVT::f128);
12175 // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
12176 // clean. Some of them require two branches to implement.
12177 AArch64CC::CondCode CC1, CC2;
12179 changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
12181 bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs();
12183 EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
12184 if (!Cmp.getNode())
12187 if (CC2 != AArch64CC::AL) {
12189 EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
12190 if (!Cmp2.getNode())
12193 Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
12196 Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
12199 Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
12204 static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
12205 SelectionDAG &DAG) {
12206 SDValue VecOp = ScalarOp.getOperand(0);
12207 auto Rdx = DAG.getNode(Op, DL, VecOp.getSimpleValueType(), VecOp);
12208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(), Rdx,
12209 DAG.getConstant(0, DL, MVT::i64));
12212 SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
12213 SelectionDAG &DAG) const {
12214 SDValue Src = Op.getOperand(0);
12216 // Try to lower fixed length reductions to SVE.
12217 EVT SrcVT = Src.getValueType();
12218 bool OverrideNEON = Op.getOpcode() == ISD::VECREDUCE_AND ||
12219 Op.getOpcode() == ISD::VECREDUCE_OR ||
12220 Op.getOpcode() == ISD::VECREDUCE_XOR ||
12221 Op.getOpcode() == ISD::VECREDUCE_FADD ||
12222 (Op.getOpcode() != ISD::VECREDUCE_ADD &&
12223 SrcVT.getVectorElementType() == MVT::i64);
12224 if (SrcVT.isScalableVector() ||
12225 useSVEForFixedLengthVectorVT(
12226 SrcVT, OverrideNEON && Subtarget->useSVEForFixedLengthVectors())) {
12228 if (SrcVT.getVectorElementType() == MVT::i1)
12229 return LowerPredReductionToSVE(Op, DAG);
12231 switch (Op.getOpcode()) {
12232 case ISD::VECREDUCE_ADD:
12233 return LowerReductionToSVE(AArch64ISD::UADDV_PRED, Op, DAG);
12234 case ISD::VECREDUCE_AND:
12235 return LowerReductionToSVE(AArch64ISD::ANDV_PRED, Op, DAG);
12236 case ISD::VECREDUCE_OR:
12237 return LowerReductionToSVE(AArch64ISD::ORV_PRED, Op, DAG);
12238 case ISD::VECREDUCE_SMAX:
12239 return LowerReductionToSVE(AArch64ISD::SMAXV_PRED, Op, DAG);
12240 case ISD::VECREDUCE_SMIN:
12241 return LowerReductionToSVE(AArch64ISD::SMINV_PRED, Op, DAG);
12242 case ISD::VECREDUCE_UMAX:
12243 return LowerReductionToSVE(AArch64ISD::UMAXV_PRED, Op, DAG);
12244 case ISD::VECREDUCE_UMIN:
12245 return LowerReductionToSVE(AArch64ISD::UMINV_PRED, Op, DAG);
12246 case ISD::VECREDUCE_XOR:
12247 return LowerReductionToSVE(AArch64ISD::EORV_PRED, Op, DAG);
12248 case ISD::VECREDUCE_FADD:
12249 return LowerReductionToSVE(AArch64ISD::FADDV_PRED, Op, DAG);
12250 case ISD::VECREDUCE_FMAX:
12251 return LowerReductionToSVE(AArch64ISD::FMAXNMV_PRED, Op, DAG);
12252 case ISD::VECREDUCE_FMIN:
12253 return LowerReductionToSVE(AArch64ISD::FMINNMV_PRED, Op, DAG);
12255 llvm_unreachable("Unhandled fixed length reduction");
12259 // Lower NEON reductions.
12261 switch (Op.getOpcode()) {
12262 case ISD::VECREDUCE_ADD:
12263 return getReductionSDNode(AArch64ISD::UADDV, dl, Op, DAG);
12264 case ISD::VECREDUCE_SMAX:
12265 return getReductionSDNode(AArch64ISD::SMAXV, dl, Op, DAG);
12266 case ISD::VECREDUCE_SMIN:
12267 return getReductionSDNode(AArch64ISD::SMINV, dl, Op, DAG);
12268 case ISD::VECREDUCE_UMAX:
12269 return getReductionSDNode(AArch64ISD::UMAXV, dl, Op, DAG);
12270 case ISD::VECREDUCE_UMIN:
12271 return getReductionSDNode(AArch64ISD::UMINV, dl, Op, DAG);
12272 case ISD::VECREDUCE_FMAX: {
12273 return DAG.getNode(
12274 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12275 DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32),
12278 case ISD::VECREDUCE_FMIN: {
12279 return DAG.getNode(
12280 ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
12281 DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32),
12285 llvm_unreachable("Unhandled reduction");
12289 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_SUB(SDValue Op,
12290 SelectionDAG &DAG) const {
12291 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12292 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12295 // LSE has an atomic load-add instruction, but not a load-sub.
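// So rewrite (atomic_load_sub addr, X) as (atomic_load_add addr, (sub 0, X)).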
12297 MVT VT = Op.getSimpleValueType();
12298 SDValue RHS = Op.getOperand(2);
12299 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12300 RHS = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), RHS);
12301 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl, AN->getMemoryVT(),
12302 Op.getOperand(0), Op.getOperand(1), RHS,
12303 AN->getMemOperand());
12306 SDValue AArch64TargetLowering::LowerATOMIC_LOAD_AND(SDValue Op,
12307 SelectionDAG &DAG) const {
12308 auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
12309 if (!Subtarget.hasLSE() && !Subtarget.outlineAtomics())
12312 // LSE has an atomic load-clear instruction, but not a load-and.
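// So rewrite (atomic_load_and addr, X) as (atomic_load_clr addr, (xor X, -1)).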
12314 MVT VT = Op.getSimpleValueType();
12315 SDValue RHS = Op.getOperand(2);
12316 AtomicSDNode *AN = cast<AtomicSDNode>(Op.getNode());
12317 RHS = DAG.getNode(ISD::XOR, dl, VT, DAG.getConstant(-1ULL, dl, VT), RHS);
12318 return DAG.getAtomic(ISD::ATOMIC_LOAD_CLR, dl, AN->getMemoryVT(),
12319 Op.getOperand(0), Op.getOperand(1), RHS,
12320 AN->getMemOperand());
12323 SDValue AArch64TargetLowering::LowerWindowsDYNAMIC_STACKALLOC(
12324 SDValue Op, SDValue Chain, SDValue &Size, SelectionDAG &DAG) const {
12326 EVT PtrVT = getPointerTy(DAG.getDataLayout());
12327 SDValue Callee = DAG.getTargetExternalSymbol("__chkstk", PtrVT, 0);
12329 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
12330 const uint32_t *Mask = TRI->getWindowsStackProbePreservedMask();
12331 if (Subtarget->hasCustomCallingConv())
12332 TRI->UpdateCustomCallPreservedMask(DAG.getMachineFunction(), &Mask);
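// __chkstk expects the requested stack adjustment in X15, in units of 16
// bytes, so scale the size down before the call and back up afterwards.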
12334 Size = DAG.getNode(ISD::SRL, dl, MVT::i64, Size,
12335 DAG.getConstant(4, dl, MVT::i64));
12336 Chain = DAG.getCopyToReg(Chain, dl, AArch64::X15, Size, SDValue());
12338 DAG.getNode(AArch64ISD::CALL, dl, DAG.getVTList(MVT::Other, MVT::Glue),
12339 Chain, Callee, DAG.getRegister(AArch64::X15, MVT::i64),
12340 DAG.getRegisterMask(Mask), Chain.getValue(1));
12341 // To match the actual intent better, we should read the output from X15 here
12342 // again (instead of potentially spilling it to the stack), but rereading Size
12343 // from X15 here doesn't work at -O0, since it thinks that X15 is undefined here.
12346 Size = DAG.getNode(ISD::SHL, dl, MVT::i64, Size,
12347 DAG.getConstant(4, dl, MVT::i64));
12352 AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
12353 SelectionDAG &DAG) const {
12354 assert(Subtarget->isTargetWindows() &&
12355 "Only Windows alloca probing supported");
12358 SDNode *Node = Op.getNode();
12359 SDValue Chain = Op.getOperand(0);
12360 SDValue Size = Op.getOperand(1);
12362 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
12363 EVT VT = Node->getValueType(0);
12365 if (DAG.getMachineFunction().getFunction().hasFnAttribute(
12366 "no-stack-arg-probe")) {
12367 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12368 Chain = SP.getValue(1);
12369 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12371 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12372 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12373 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
12374 SDValue Ops[2] = {SP, Chain};
12375 return DAG.getMergeValues(Ops, dl);
12378 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
12380 Chain = LowerWindowsDYNAMIC_STACKALLOC(Op, Chain, Size, DAG);
12382 SDValue SP = DAG.getCopyFromReg(Chain, dl, AArch64::SP, MVT::i64);
12383 Chain = SP.getValue(1);
12384 SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
12386 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
12387 DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
12388 Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
12390 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
12391 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
12393 SDValue Ops[2] = {SP, Chain};
12394 return DAG.getMergeValues(Ops, dl);
12397 SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
12398 SelectionDAG &DAG) const {
12399 EVT VT = Op.getValueType();
12400 assert(VT != MVT::i64 && "Expected illegal VSCALE node");
12403 APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
12404 return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sext(64)), DL,
12408 /// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
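/// For example, aarch64_sve_st3 of nxv4i32 values records a memVT of nxv12i32,
/// i.e. three registers' worth of data.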
12409 template <unsigned NumVecs>
12411 setInfoSVEStN(const AArch64TargetLowering &TLI, const DataLayout &DL,
12412 AArch64TargetLowering::IntrinsicInfo &Info, const CallInst &CI) {
12413 Info.opc = ISD::INTRINSIC_VOID;
12414 // Retrieve EC from first vector argument.
12415 const EVT VT = TLI.getMemValueType(DL, CI.getArgOperand(0)->getType());
12416 ElementCount EC = VT.getVectorElementCount();
12418 // Check the assumption that all input vectors are the same type.
12419 for (unsigned I = 0; I < NumVecs; ++I)
12420 assert(VT == TLI.getMemValueType(DL, CI.getArgOperand(I)->getType()) &&
12423 // memVT is `NumVecs * VT`.
12424 Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
12426 Info.ptrVal = CI.getArgOperand(CI.arg_size() - 1);
12428 Info.align.reset();
12429 Info.flags = MachineMemOperand::MOStore;
12433 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
12434 /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
12435 /// specified in the intrinsic calls.
12436 bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
12438 MachineFunction &MF,
12439 unsigned Intrinsic) const {
12440 auto &DL = I.getModule()->getDataLayout();
12441 switch (Intrinsic) {
12442 case Intrinsic::aarch64_sve_st2:
12443 return setInfoSVEStN<2>(*this, DL, Info, I);
12444 case Intrinsic::aarch64_sve_st3:
12445 return setInfoSVEStN<3>(*this, DL, Info, I);
12446 case Intrinsic::aarch64_sve_st4:
12447 return setInfoSVEStN<4>(*this, DL, Info, I);
12448 case Intrinsic::aarch64_neon_ld2:
12449 case Intrinsic::aarch64_neon_ld3:
12450 case Intrinsic::aarch64_neon_ld4:
12451 case Intrinsic::aarch64_neon_ld1x2:
12452 case Intrinsic::aarch64_neon_ld1x3:
12453 case Intrinsic::aarch64_neon_ld1x4:
12454 case Intrinsic::aarch64_neon_ld2lane:
12455 case Intrinsic::aarch64_neon_ld3lane:
12456 case Intrinsic::aarch64_neon_ld4lane:
12457 case Intrinsic::aarch64_neon_ld2r:
12458 case Intrinsic::aarch64_neon_ld3r:
12459 case Intrinsic::aarch64_neon_ld4r: {
12460 Info.opc = ISD::INTRINSIC_W_CHAIN;
12461 // Conservatively set memVT to the entire set of vectors loaded.
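// For example, an aarch64.neon.ld3 returning three <4 x i32> values is
// modelled as a single 384-bit (v6i64) load.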
12462 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
12463 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12464 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
12466 Info.align.reset();
12467 // Volatile loads with NEON intrinsics are not supported.
12468 Info.flags = MachineMemOperand::MOLoad;
12471 case Intrinsic::aarch64_neon_st2:
12472 case Intrinsic::aarch64_neon_st3:
12473 case Intrinsic::aarch64_neon_st4:
12474 case Intrinsic::aarch64_neon_st1x2:
12475 case Intrinsic::aarch64_neon_st1x3:
12476 case Intrinsic::aarch64_neon_st1x4:
12477 case Intrinsic::aarch64_neon_st2lane:
12478 case Intrinsic::aarch64_neon_st3lane:
12479 case Intrinsic::aarch64_neon_st4lane: {
12480 Info.opc = ISD::INTRINSIC_VOID;
12481 // Conservatively set memVT to the entire set of vectors stored.
12482 unsigned NumElts = 0;
12483 for (const Value *Arg : I.args()) {
12484 Type *ArgTy = Arg->getType();
12485 if (!ArgTy->isVectorTy())
12487 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
12489 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
12490 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
12492 Info.align.reset();
12493 // Volatile stores with NEON intrinsics are not supported.
12494 Info.flags = MachineMemOperand::MOStore;
12497 case Intrinsic::aarch64_ldaxr:
12498 case Intrinsic::aarch64_ldxr: {
12499 Type *ValTy = I.getParamElementType(0);
12500 Info.opc = ISD::INTRINSIC_W_CHAIN;
12501 Info.memVT = MVT::getVT(ValTy);
12502 Info.ptrVal = I.getArgOperand(0);
12504 Info.align = DL.getABITypeAlign(ValTy);
12505 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
12508 case Intrinsic::aarch64_stlxr:
12509 case Intrinsic::aarch64_stxr: {
12510 Type *ValTy = I.getParamElementType(1);
12511 Info.opc = ISD::INTRINSIC_W_CHAIN;
12512 Info.memVT = MVT::getVT(ValTy);
12513 Info.ptrVal = I.getArgOperand(1);
12515 Info.align = DL.getABITypeAlign(ValTy);
12516 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
12519 case Intrinsic::aarch64_ldaxp:
12520 case Intrinsic::aarch64_ldxp:
12521 Info.opc = ISD::INTRINSIC_W_CHAIN;
12522 Info.memVT = MVT::i128;
12523 Info.ptrVal = I.getArgOperand(0);
12525 Info.align = Align(16);
12526 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
12528 case Intrinsic::aarch64_stlxp:
12529 case Intrinsic::aarch64_stxp:
12530 Info.opc = ISD::INTRINSIC_W_CHAIN;
12531 Info.memVT = MVT::i128;
12532 Info.ptrVal = I.getArgOperand(2);
12534 Info.align = Align(16);
12535 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
12537 case Intrinsic::aarch64_sve_ldnt1: {
12538 Type *ElTy = cast<VectorType>(I.getType())->getElementType();
12539 Info.opc = ISD::INTRINSIC_W_CHAIN;
12540 Info.memVT = MVT::getVT(I.getType());
12541 Info.ptrVal = I.getArgOperand(1);
12543 Info.align = DL.getABITypeAlign(ElTy);
12544 Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
12547 case Intrinsic::aarch64_sve_stnt1: {
12549 cast<VectorType>(I.getArgOperand(0)->getType())->getElementType();
12550 Info.opc = ISD::INTRINSIC_W_CHAIN;
12551 Info.memVT = MVT::getVT(I.getOperand(0)->getType());
12552 Info.ptrVal = I.getArgOperand(2);
12554 Info.align = DL.getABITypeAlign(ElTy);
12555 Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
12558 case Intrinsic::aarch64_mops_memset_tag: {
12559 Value *Dst = I.getArgOperand(0);
12560 Value *Val = I.getArgOperand(1);
12561 Info.opc = ISD::INTRINSIC_W_CHAIN;
12562 Info.memVT = MVT::getVT(Val->getType());
12565 Info.align = I.getParamAlign(0).valueOrOne();
12566 Info.flags = MachineMemOperand::MOStore;
12567 // The size of the memory being operated on is unknown at this point
12568 Info.size = MemoryLocation::UnknownSize;
12578 bool AArch64TargetLowering::shouldReduceLoadWidth(SDNode *Load,
12579 ISD::LoadExtType ExtTy,
12581 // TODO: This may be worth removing. Check regression tests for diffs.
12582 if (!TargetLoweringBase::shouldReduceLoadWidth(Load, ExtTy, NewVT))
12585 // If we're reducing the load width in order to avoid having to use an extra
12586 // instruction to do extension then it's probably a good idea.
12587 if (ExtTy != ISD::NON_EXTLOAD)
12589 // Don't reduce load width if it would prevent us from combining a shift into the addressing mode.
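// For example, an i64 load from (add X, (shl Y, 3)) can use register-offset
// addressing with the index shifted left by 3 (LDR Xt, [Xn, Xm, LSL #3]);
// narrowing the load to i32 would stop the shl from folding into the address.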
12591 MemSDNode *Mem = dyn_cast<MemSDNode>(Load);
12593 const SDValue &Base = Mem->getBasePtr();
12594 if (Base.getOpcode() == ISD::ADD &&
12595 Base.getOperand(1).getOpcode() == ISD::SHL &&
12596 Base.getOperand(1).hasOneUse() &&
12597 Base.getOperand(1).getOperand(1).getOpcode() == ISD::Constant) {
12598 // It's unknown whether a scalable vector has a power-of-2 bitwidth.
12599 if (Mem->getMemoryVT().isScalableVector())
12601 // The shift can be combined if it matches the size of the value being
12602 // loaded (and so reducing the width would make it not match).
12603 uint64_t ShiftAmount = Base.getOperand(1).getConstantOperandVal(1);
12604 uint64_t LoadBytes = Mem->getMemoryVT().getSizeInBits()/8;
12605 if (ShiftAmount == Log2_32(LoadBytes))
12608 // We have no reason to disallow reducing the load width, so allow it.
12612 // Truncations from 64-bit GPR to 32-bit GPR are free.
12613 bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
12614 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
12616 uint64_t NumBits1 = Ty1->getPrimitiveSizeInBits().getFixedSize();
12617 uint64_t NumBits2 = Ty2->getPrimitiveSizeInBits().getFixedSize();
12618 return NumBits1 > NumBits2;
12620 bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
12621 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
12623 uint64_t NumBits1 = VT1.getFixedSizeInBits();
12624 uint64_t NumBits2 = VT2.getFixedSizeInBits();
12625 return NumBits1 > NumBits2;
12628 /// Check if it is profitable to hoist an instruction in then/else to if.
12629 /// Not profitable if I and its user can form an FMA instruction
12630 /// because we prefer FMSUB/FMADD.
12631 bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
12632 if (I->getOpcode() != Instruction::FMul)
12635 if (!I->hasOneUse())
12638 Instruction *User = I->user_back();
12640 if (!(User->getOpcode() == Instruction::FSub ||
12641 User->getOpcode() == Instruction::FAdd))
12644 const TargetOptions &Options = getTargetMachine().Options;
12645 const Function *F = I->getFunction();
12646 const DataLayout &DL = F->getParent()->getDataLayout();
12647 Type *Ty = User->getOperand(0)->getType();
12649 return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
12650 isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
12651 (Options.AllowFPOpFusion == FPOpFusion::Fast ||
12652 Options.UnsafeFPMath));
12655 // All 32-bit GPR operations implicitly zero the high-half of the corresponding
12656 // 64-bit GPR, making a 32-to-64-bit zero extension free.
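// For example (illustrative): after "add w0, w1, w2" the upper 32 bits of x0
// are already zero, so a following zext from i32 to i64 needs no instruction.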
12657 bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
12658 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
12660 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
12661 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
12662 return NumBits1 == 32 && NumBits2 == 64;
12664 bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
12665 if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
12667 unsigned NumBits1 = VT1.getSizeInBits();
12668 unsigned NumBits2 = VT2.getSizeInBits();
12669 return NumBits1 == 32 && NumBits2 == 64;
12672 bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
12673 EVT VT1 = Val.getValueType();
12674 if (isZExtFree(VT1, VT2)) {
12678 if (Val.getOpcode() != ISD::LOAD)
12681 // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
12682 return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() &&
12683 VT2.isSimple() && !VT2.isVector() && VT2.isInteger() &&
12684 VT1.getSizeInBits() <= 32);
12687 bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
12688 if (isa<FPExtInst>(Ext))
12691 // Vector types are not free.
12692 if (Ext->getType()->isVectorTy())
12695 for (const Use &U : Ext->uses()) {
12696 // The extension is free if we can fold it with a left shift in an
12697 // addressing mode or an arithmetic operation: add, sub, and cmp.
12699 // Is there a shift?
12700 const Instruction *Instr = cast<Instruction>(U.getUser());
12702 // Is this a constant shift?
12703 switch (Instr->getOpcode()) {
12704 case Instruction::Shl:
12705 if (!isa<ConstantInt>(Instr->getOperand(1)))
12708 case Instruction::GetElementPtr: {
12709 gep_type_iterator GTI = gep_type_begin(Instr);
12710 auto &DL = Ext->getModule()->getDataLayout();
12711 std::advance(GTI, U.getOperandNo()-1);
12712 Type *IdxTy = GTI.getIndexedType();
12713 // This extension will end up with a shift because of the scaling factor.
12714 // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
12715 // Get the shift amount based on the scaling factor:
12716 // log2(sizeof(IdxTy)) - log2(8).
12717 uint64_t ShiftAmt =
12718 countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy).getFixedSize()) - 3;
12719 // Is the constant foldable in the shift of the addressing mode?
12720 // I.e., shift amount is between 1 and 4 inclusive.
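// Illustrative example (assumed): indexing an i32 array gives a scaling
// factor of 4, so ShiftAmt == 2, which fits the extended-register addressing
// mode "ldr w0, [x1, w2, sxtw #2]" and makes the sext of the index free.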
12721 if (ShiftAmt == 0 || ShiftAmt > 4)
12725 case Instruction::Trunc:
12726 // Check if this is a noop.
12727 // trunc(sext ty1 to ty2) to ty1.
12728 if (Instr->getType() == Ext->getOperand(0)->getType())
12735 // At this point we can use the bfm family, so this extension is free
12741 /// Check if both Op1 and Op2 are shufflevector extracts of either the lower
12742 /// or upper half of the vector elements.
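/// Illustrative IR (assumed), both operands extracting the upper half:
///   %a.hi = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <8, ..., 15>
///   %b.hi = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <8, ..., 15>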
12743 static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
12744 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
12745 auto *FullTy = FullV->getType();
12746 auto *HalfTy = HalfV->getType();
12747 return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
12748 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
12751 auto extractHalf = [](Value *FullV, Value *HalfV) {
12752 auto *FullVT = cast<FixedVectorType>(FullV->getType());
12753 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
12754 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
12757 ArrayRef<int> M1, M2;
12758 Value *S1Op1, *S2Op1;
12759 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
12760 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
12763 // Check that the operands are half as wide as the result and we extract
12764 // half of the elements of the input vectors.
12765 if (!areTypesHalfed(S1Op1, Op1) || !areTypesHalfed(S2Op1, Op2) ||
12766 !extractHalf(S1Op1, Op1) || !extractHalf(S2Op1, Op2))
12769 // Check that the mask extracts either the lower or upper half of the vector
12770 // elements.
12773 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
12774 if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
12775 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
12776 M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
12782 /// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
12783 /// of the vector elements.
12784 static bool areExtractExts(Value *Ext1, Value *Ext2) {
12785 auto areExtDoubled = [](Instruction *Ext) {
12786 return Ext->getType()->getScalarSizeInBits() ==
12787 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
12790 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
12791 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
12792 !areExtDoubled(cast<Instruction>(Ext1)) ||
12793 !areExtDoubled(cast<Instruction>(Ext2)))
12799 /// Check if Op could be used with vmull_high_p64 intrinsic.
12800 static bool isOperandOfVmullHighP64(Value *Op) {
12801 Value *VectorOperand = nullptr;
12802 ConstantInt *ElementIndex = nullptr;
12803 return match(Op, m_ExtractElt(m_Value(VectorOperand),
12804 m_ConstantInt(ElementIndex))) &&
12805 ElementIndex->getValue() == 1 &&
12806 isa<FixedVectorType>(VectorOperand->getType()) &&
12807 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
12810 /// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
12811 static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
12812 return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
12815 static bool isSplatShuffle(Value *V) {
12816 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
12817 return is_splat(Shuf->getShuffleMask());
12821 /// Check if sinking \p I's operands to I's basic block is profitable, because
12822 /// the operands can be folded into a target instruction, e.g.
12823 /// shufflevector extracts and/or sext/zext can be folded into (u,s)subl(2).
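/// Illustrative example (assumed): if the two half-extracting shuffles feeding
/// an @llvm.aarch64.neon.smull call live in a different block, sinking them
/// next to the call lets instruction selection pick the high-half SMULL2 form.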
12824 bool AArch64TargetLowering::shouldSinkOperands(
12825 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
12826 if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
12827 switch (II->getIntrinsicID()) {
12828 case Intrinsic::aarch64_neon_smull:
12829 case Intrinsic::aarch64_neon_umull:
12830 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1))) {
12831 Ops.push_back(&II->getOperandUse(0));
12832 Ops.push_back(&II->getOperandUse(1));
12837 case Intrinsic::fma:
12838 if (isa<VectorType>(I->getType()) &&
12839 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
12840 !Subtarget->hasFullFP16())
12843 case Intrinsic::aarch64_neon_sqdmull:
12844 case Intrinsic::aarch64_neon_sqdmulh:
12845 case Intrinsic::aarch64_neon_sqrdmulh:
12846 // Sink splats for index lane variants
12847 if (isSplatShuffle(II->getOperand(0)))
12848 Ops.push_back(&II->getOperandUse(0));
12849 if (isSplatShuffle(II->getOperand(1)))
12850 Ops.push_back(&II->getOperandUse(1));
12851 return !Ops.empty();
12852 case Intrinsic::aarch64_sve_ptest_first:
12853 case Intrinsic::aarch64_sve_ptest_last:
12854 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
12855 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
12856 Ops.push_back(&II->getOperandUse(0));
12857 return !Ops.empty();
12858 case Intrinsic::aarch64_sme_write_horiz:
12859 case Intrinsic::aarch64_sme_write_vert:
12860 case Intrinsic::aarch64_sme_writeq_horiz:
12861 case Intrinsic::aarch64_sme_writeq_vert: {
12862 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
12863 if (!Idx || Idx->getOpcode() != Instruction::Add)
12865 Ops.push_back(&II->getOperandUse(1));
12868 case Intrinsic::aarch64_sme_read_horiz:
12869 case Intrinsic::aarch64_sme_read_vert:
12870 case Intrinsic::aarch64_sme_readq_horiz:
12871 case Intrinsic::aarch64_sme_readq_vert:
12872 case Intrinsic::aarch64_sme_ld1b_vert:
12873 case Intrinsic::aarch64_sme_ld1h_vert:
12874 case Intrinsic::aarch64_sme_ld1w_vert:
12875 case Intrinsic::aarch64_sme_ld1d_vert:
12876 case Intrinsic::aarch64_sme_ld1q_vert:
12877 case Intrinsic::aarch64_sme_st1b_vert:
12878 case Intrinsic::aarch64_sme_st1h_vert:
12879 case Intrinsic::aarch64_sme_st1w_vert:
12880 case Intrinsic::aarch64_sme_st1d_vert:
12881 case Intrinsic::aarch64_sme_st1q_vert:
12882 case Intrinsic::aarch64_sme_ld1b_horiz:
12883 case Intrinsic::aarch64_sme_ld1h_horiz:
12884 case Intrinsic::aarch64_sme_ld1w_horiz:
12885 case Intrinsic::aarch64_sme_ld1d_horiz:
12886 case Intrinsic::aarch64_sme_ld1q_horiz:
12887 case Intrinsic::aarch64_sme_st1b_horiz:
12888 case Intrinsic::aarch64_sme_st1h_horiz:
12889 case Intrinsic::aarch64_sme_st1w_horiz:
12890 case Intrinsic::aarch64_sme_st1d_horiz:
12891 case Intrinsic::aarch64_sme_st1q_horiz: {
12892 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
12893 if (!Idx || Idx->getOpcode() != Instruction::Add)
12895 Ops.push_back(&II->getOperandUse(3));
12898 case Intrinsic::aarch64_neon_pmull:
12899 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
12901 Ops.push_back(&II->getOperandUse(0));
12902 Ops.push_back(&II->getOperandUse(1));
12904 case Intrinsic::aarch64_neon_pmull64:
12905 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
12906 II->getArgOperand(1)))
12908 Ops.push_back(&II->getArgOperandUse(0));
12909 Ops.push_back(&II->getArgOperandUse(1));
12916 if (!I->getType()->isVectorTy())
12919 switch (I->getOpcode()) {
12920 case Instruction::Sub:
12921 case Instruction::Add: {
12922 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
12925 // If the exts' operands extract either the lower or upper elements, we
12926 // can sink them too.
12927 auto Ext1 = cast<Instruction>(I->getOperand(0));
12928 auto Ext2 = cast<Instruction>(I->getOperand(1));
12929 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
12930 Ops.push_back(&Ext1->getOperandUse(0));
12931 Ops.push_back(&Ext2->getOperandUse(0));
12934 Ops.push_back(&I->getOperandUse(0));
12935 Ops.push_back(&I->getOperandUse(1));
12939 case Instruction::Mul: {
12940 bool IsProfitable = false;
12941 for (auto &Op : I->operands()) {
12942 // Make sure we are not already sinking this operand
12943 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
12946 ShuffleVectorInst *Shuffle = dyn_cast<ShuffleVectorInst>(Op);
12947 if (!Shuffle || !Shuffle->isZeroEltSplat())
12950 Value *ShuffleOperand = Shuffle->getOperand(0);
12951 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
12955 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
12959 ConstantInt *ElementConstant =
12960 dyn_cast<ConstantInt>(Insert->getOperand(2));
12961 // Check that the insertelement is inserting into element 0
12962 if (!ElementConstant || ElementConstant->getZExtValue() != 0)
12965 unsigned Opcode = OperandInstr->getOpcode();
12966 if (Opcode != Instruction::SExt && Opcode != Instruction::ZExt)
12969 Ops.push_back(&Shuffle->getOperandUse(0));
12970 Ops.push_back(&Op);
12971 IsProfitable = true;
12974 return IsProfitable;
12982 bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
12983 Align &RequiredAlignment) const {
12984 if (!LoadedType.isSimple() ||
12985 (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
12987 // Cyclone supports unaligned accesses.
12988 RequiredAlignment = Align(1);
12989 unsigned NumBits = LoadedType.getSizeInBits();
12990 return NumBits == 32 || NumBits == 64;
12993 /// A helper function for determining the number of interleaved accesses we
12994 /// will generate when lowering accesses of the given type.
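/// For example (illustrative): a <16 x i32> access is 512 bits, so with
/// 128-bit NEON vectors this returns (512 + 127) / 128 = 4 accesses.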
12995 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
12996 VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
12997 unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
12998 return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
13001 MachineMemOperand::Flags
13002 AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
13003 if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
13004 I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
13005 return MOStridedAccess;
13006 return MachineMemOperand::MONone;
13009 bool AArch64TargetLowering::isLegalInterleavedAccessType(
13010 VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
13012 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
13013 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
13014 unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
13016 UseScalable = false;
13018 // Ensure the number of vector elements is greater than 1.
13019 if (NumElements < 2)
13022 // Ensure the element type is legal.
13023 if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
13026 if (Subtarget->useSVEForFixedLengthVectors() &&
13027 (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
13028 (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
13029 isPowerOf2_32(NumElements) && VecSize > 128))) {
13030 UseScalable = true;
13034 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
13035 // 128 will be split into multiple interleaved accesses.
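// For example (illustrative): <8 x i8> (64 bits) and <8 x i32> (256 bits,
// split into two accesses) are accepted, while <3 x i32> (96 bits) is not.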
13036 return VecSize == 64 || VecSize % 128 == 0;
13039 static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
13040 if (VTy->getElementType() == Type::getDoubleTy(VTy->getContext()))
13041 return ScalableVectorType::get(VTy->getElementType(), 2);
13043 if (VTy->getElementType() == Type::getFloatTy(VTy->getContext()))
13044 return ScalableVectorType::get(VTy->getElementType(), 4);
13046 if (VTy->getElementType() == Type::getBFloatTy(VTy->getContext()))
13047 return ScalableVectorType::get(VTy->getElementType(), 8);
13049 if (VTy->getElementType() == Type::getHalfTy(VTy->getContext()))
13050 return ScalableVectorType::get(VTy->getElementType(), 8);
13052 if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
13053 return ScalableVectorType::get(VTy->getElementType(), 2);
13055 if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
13056 return ScalableVectorType::get(VTy->getElementType(), 4);
13058 if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
13059 return ScalableVectorType::get(VTy->getElementType(), 8);
13061 if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
13062 return ScalableVectorType::get(VTy->getElementType(), 16);
13064 llvm_unreachable("Cannot handle input vector type");
13067 /// Lower an interleaved load into a ldN intrinsic.
13069 /// E.g. Lower an interleaved load (Factor = 2):
13070 /// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
13071 /// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
13072 /// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
13075 /// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
13076 /// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
13077 /// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
13078 bool AArch64TargetLowering::lowerInterleavedLoad(
13079 LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
13080 ArrayRef<unsigned> Indices, unsigned Factor) const {
13081 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13082 "Invalid interleave factor");
13083 assert(!Shuffles.empty() && "Empty shufflevector input");
13084 assert(Shuffles.size() == Indices.size() &&
13085 "Unmatched number of shufflevectors and indices");
13087 const DataLayout &DL = LI->getModule()->getDataLayout();
13089 VectorType *VTy = Shuffles[0]->getType();
13091 // Skip if we do not have NEON and skip illegal vector types. We can
13092 // "legalize" wide vector types into multiple interleaved accesses as long as
13093 // the vector types are divisible by 128.
13095 if (!Subtarget->hasNEON() ||
13096 !isLegalInterleavedAccessType(VTy, DL, UseScalable))
13099 unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
13101 auto *FVTy = cast<FixedVectorType>(VTy);
13103 // A pointer vector can not be the return type of the ldN intrinsics. Need to
13104 // load integer vectors first and then convert to pointer vectors.
13105 Type *EltTy = FVTy->getElementType();
13106 if (EltTy->isPointerTy())
13108 FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
13110 // If we're going to generate more than one load, reset the sub-vector type
13111 // to something legal.
13112 FVTy = FixedVectorType::get(FVTy->getElementType(),
13113 FVTy->getNumElements() / NumLoads);
13116 UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
13118 IRBuilder<> Builder(LI);
13120 // The base address of the load.
13121 Value *BaseAddr = LI->getPointerOperand();
13123 if (NumLoads > 1) {
13124 // We will compute the pointer operand of each load from the original base
13125 // address using GEPs. Cast the base address to a pointer to the scalar
13126 // element type.
13127 BaseAddr = Builder.CreateBitCast(
13129 LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
13134 ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
13135 : LDVTy->getPointerTo(LI->getPointerAddressSpace());
13136 Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
13137 LDVTy->getElementCount());
13139 static const Intrinsic::ID SVELoadIntrs[3] = {
13140 Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
13141 Intrinsic::aarch64_sve_ld4_sret};
13142 static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
13143 Intrinsic::aarch64_neon_ld3,
13144 Intrinsic::aarch64_neon_ld4};
13147 LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
13148 SVELoadIntrs[Factor - 2], {LDVTy});
13150 LdNFunc = Intrinsic::getDeclaration(
13151 LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
13153 // Holds sub-vectors extracted from the load intrinsic return values. The
13154 // sub-vectors are associated with the shufflevector instructions they will
13155 // eventually replace.
13156 DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
13158 Value *PTrue = nullptr;
13160 Optional<unsigned> PgPattern =
13161 getSVEPredPatternFromNumElements(FVTy->getNumElements());
13162 if (Subtarget->getMinSVEVectorSizeInBits() ==
13163 Subtarget->getMaxSVEVectorSizeInBits() &&
13164 Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
13165 PgPattern = AArch64SVEPredPattern::all;
13168 ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), *PgPattern);
13169 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
13173 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
13175 // If we're generating more than one load, compute the base address of
13176 // subsequent loads as an offset from the previous.
13178 BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
13179 FVTy->getNumElements() * Factor);
13183 LdN = Builder.CreateCall(
13184 LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
13186 LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
13189 // Extract and store the sub-vectors returned by the load intrinsic.
13190 for (unsigned i = 0; i < Shuffles.size(); i++) {
13191 ShuffleVectorInst *SVI = Shuffles[i];
13192 unsigned Index = Indices[i];
13194 Value *SubVec = Builder.CreateExtractValue(LdN, Index);
13197 SubVec = Builder.CreateExtractVector(
13199 ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
13201 // Convert the integer vector to pointer vector if the element is pointer.
13202 if (EltTy->isPointerTy())
13203 SubVec = Builder.CreateIntToPtr(
13204 SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
13205 FVTy->getNumElements()));
13207 SubVecs[SVI].push_back(SubVec);
13211 // Replace uses of the shufflevector instructions with the sub-vectors
13212 // returned by the load intrinsic. If a shufflevector instruction is
13213 // associated with more than one sub-vector, those sub-vectors will be
13214 // concatenated into a single wide vector.
13215 for (ShuffleVectorInst *SVI : Shuffles) {
13216 auto &SubVec = SubVecs[SVI];
13218 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
13219 SVI->replaceAllUsesWith(WideVec);
13225 /// Lower an interleaved store into a stN intrinsic.
13227 /// E.g. Lower an interleaved store (Factor = 3):
13228 /// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
13229 /// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
13230 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
13233 /// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
13234 /// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
13235 /// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
13236 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
13238 /// Note that the new shufflevectors will be removed and we'll only generate one
13239 /// st3 instruction in CodeGen.
13241 /// Example for a more general valid mask (Factor 3). Lower:
13242 /// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
13243 /// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
13244 /// store <12 x i32> %i.vec, <12 x i32>* %ptr
13247 /// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> v1, <4, 5, 6, 7>
13248 /// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
13249 /// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
13250 /// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
13251 bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
13252 ShuffleVectorInst *SVI,
13253 unsigned Factor) const {
13254 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
13255 "Invalid interleave factor");
13257 auto *VecTy = cast<FixedVectorType>(SVI->getType());
13258 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
13260 unsigned LaneLen = VecTy->getNumElements() / Factor;
13261 Type *EltTy = VecTy->getElementType();
13262 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
13264 const DataLayout &DL = SI->getModule()->getDataLayout();
13267 // Skip if we do not have NEON and skip illegal vector types. We can
13268 // "legalize" wide vector types into multiple interleaved accesses as long as
13269 // the vector types are divisible by 128.
13270 if (!Subtarget->hasNEON() ||
13271 !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
13274 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
13276 Value *Op0 = SVI->getOperand(0);
13277 Value *Op1 = SVI->getOperand(1);
13278 IRBuilder<> Builder(SI);
13280 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
13281 // vectors to integer vectors.
13282 if (EltTy->isPointerTy()) {
13283 Type *IntTy = DL.getIntPtrType(EltTy);
13284 unsigned NumOpElts =
13285 cast<FixedVectorType>(Op0->getType())->getNumElements();
13287 // Convert to the corresponding integer vector.
13288 auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
13289 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
13290 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
13292 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
13295 // If we're going to generate more than one store, reset the lane length
13296 // and sub-vector type to something legal.
13297 LaneLen /= NumStores;
13298 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
13300 auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
13303 // The base address of the store.
13304 Value *BaseAddr = SI->getPointerOperand();
13306 if (NumStores > 1) {
13307 // We will compute the pointer operand of each store from the original base
13308 // address using GEPs. Cast the base address to a pointer to the scalar
13309 // element type.
13310 BaseAddr = Builder.CreateBitCast(
13312 SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
13315 auto Mask = SVI->getShuffleMask();
13319 ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
13320 : STVTy->getPointerTo(SI->getPointerAddressSpace());
13321 Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
13322 STVTy->getElementCount());
13324 static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
13325 Intrinsic::aarch64_sve_st3,
13326 Intrinsic::aarch64_sve_st4};
13327 static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
13328 Intrinsic::aarch64_neon_st3,
13329 Intrinsic::aarch64_neon_st4};
13332 StNFunc = Intrinsic::getDeclaration(SI->getModule(),
13333 SVEStoreIntrs[Factor - 2], {STVTy});
13335 StNFunc = Intrinsic::getDeclaration(
13336 SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
13338 Value *PTrue = nullptr;
13340 Optional<unsigned> PgPattern =
13341 getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
13342 if (Subtarget->getMinSVEVectorSizeInBits() ==
13343 Subtarget->getMaxSVEVectorSizeInBits() &&
13344 Subtarget->getMinSVEVectorSizeInBits() ==
13345 DL.getTypeSizeInBits(SubVecTy))
13346 PgPattern = AArch64SVEPredPattern::all;
13349 ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), *PgPattern);
13350 PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
13354 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
13356 SmallVector<Value *, 5> Ops;
13358 // Split the shufflevector operands into sub vectors for the new stN call.
13359 for (unsigned i = 0; i < Factor; i++) {
13361 unsigned IdxI = StoreCount * LaneLen * Factor + i;
13362 if (Mask[IdxI] >= 0) {
13363 Shuffle = Builder.CreateShuffleVector(
13364 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
13366 unsigned StartMask = 0;
13367 for (unsigned j = 1; j < LaneLen; j++) {
13368 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
13369 if (Mask[IdxJ * Factor + IdxI] >= 0) {
13370 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
13374 // Note: Filling undef gaps with random elements is ok, since
13375 // those elements were being written anyway (with undefs).
13376 // In the case of all undefs we default to using elements from 0.
13377 // Note: StartMask cannot be negative, it's checked in
13378 // isReInterleaveMask
13379 Shuffle = Builder.CreateShuffleVector(
13380 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
13384 Shuffle = Builder.CreateInsertVector(
13385 STVTy, UndefValue::get(STVTy), Shuffle,
13386 ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
13388 Ops.push_back(Shuffle);
13392 Ops.push_back(PTrue);
13394 // If we're generating more than one store, we compute the base address of
13395 // subsequent stores as an offset from the previous.
13396 if (StoreCount > 0)
13397 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
13398 BaseAddr, LaneLen * Factor);
13400 Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
13401 Builder.CreateCall(StNFunc, Ops);
13406 // Lower an SVE structured load intrinsic returning a tuple type to target
13407 // specific intrinsic taking the same input but returning a multi-result value
13408 // of the split tuple type.
13410 // E.g. Lowering an LD3:
13412 // call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
13413 // <vscale x 4 x i1> %pred,
13414 // <vscale x 4 x i32>* %addr)
13418 // t0: ch = EntryToken
13419 // t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
13420 // t4: i64,ch = CopyFromReg t0, Register:i64 %1
13421 // t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
13422 // t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
13424 // This is called pre-legalization to avoid widening/splitting issues with
13425 // non-power-of-2 tuple types used for LD3, such as nxv12i32.
13426 SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
13427 ArrayRef<SDValue> LoadOps,
13428 EVT VT, SelectionDAG &DAG,
13429 const SDLoc &DL) const {
13430 assert(VT.isScalableVector() && "Can only lower scalable vectors");
13432 unsigned N, Opcode;
13433 static const std::pair<unsigned, std::pair<unsigned, unsigned>>
13435 {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
13436 {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
13437 {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
13439 std::tie(N, Opcode) = llvm::find_if(IntrinsicMap, [&](auto P) {
13440 return P.first == Intrinsic;
13442 assert(VT.getVectorElementCount().getKnownMinValue() % N == 0 &&
13443 "invalid tuple vector type!");
13446 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
13447 VT.getVectorElementCount().divideCoefficientBy(N));
13448 assert(isTypeLegal(SplitVT));
13450 SmallVector<EVT, 5> VTs(N, SplitVT);
13451 VTs.push_back(MVT::Other); // Chain
13452 SDVTList NodeTys = DAG.getVTList(VTs);
13454 SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
13455 SmallVector<SDValue, 4> PseudoLoadOps;
13456 for (unsigned I = 0; I < N; ++I)
13457 PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
13458 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
13461 EVT AArch64TargetLowering::getOptimalMemOpType(
13462 const MemOp &Op, const AttributeList &FuncAttributes) const {
13463 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
13464 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
13465 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
13466 // Only use AdvSIMD to implement memset of 32 bytes and above. It would have
13467 // taken one instruction to materialize the v2i64 zero and one store (with
13468 // restrictive addressing mode). Just do i64 stores.
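// For example (illustrative): a 64-byte, 16-byte-aligned memset can use v16i8
// stores, while a 16-byte memset is better served by a pair of i64 stores.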
13469 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
13470 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
13471 if (Op.isAligned(AlignCheck))
13474 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
13475 MachineMemOperand::MONone, &Fast) &&
13479 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
13480 AlignmentIsAcceptable(MVT::v16i8, Align(16)))
13482 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
13484 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
13486 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
13491 LLT AArch64TargetLowering::getOptimalMemOpLLT(
13492 const MemOp &Op, const AttributeList &FuncAttributes) const {
13493 bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
13494 bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
13495 bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
13496 // Only use AdvSIMD to implement memset of 32 bytes and above. It would have
13497 // taken one instruction to materialize the v2i64 zero and one store (with
13498 // restrictive addressing mode). Just do i64 stores.
13499 bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
13500 auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
13501 if (Op.isAligned(AlignCheck))
13504 return allowsMisalignedMemoryAccesses(VT, 0, Align(1),
13505 MachineMemOperand::MONone, &Fast) &&
13509 if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
13510 AlignmentIsAcceptable(MVT::v2i64, Align(16)))
13511 return LLT::fixed_vector(2, 64);
13512 if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
13513 return LLT::scalar(128);
13514 if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
13515 return LLT::scalar(64);
13516 if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
13517 return LLT::scalar(32);
13521 // 12-bit optionally shifted immediates are legal for adds.
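// For example (illustrative): 4095 (0xfff) and 0xfff000 (0xfff << 12) are both
// encodable, while 0x1001 is not, since it needs bits in both the shifted and
// unshifted 12-bit fields.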
13522 bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
13523 if (Immed == std::numeric_limits<int64_t>::min()) {
13524 LLVM_DEBUG(dbgs() << "Illegal add imm " << Immed
13525 << ": avoid UB for INT64_MIN\n");
13528 // Same encoding for add/sub, just flip the sign.
13529 Immed = std::abs(Immed);
13530 bool IsLegal = ((Immed >> 12) == 0 ||
13531 ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
13532 LLVM_DEBUG(dbgs() << "Is " << Immed
13533 << " legal add imm: " << (IsLegal ? "yes" : "no") << "\n");
13537 // Return false to prevent folding
13538 // (mul (add x, c1), c2) -> (add (mul x, c2), c2*c1) in DAGCombine,
13539 // if the folding leads to worse code.
13540 bool AArch64TargetLowering::isMulAddWithConstProfitable(
13541 SDValue AddNode, SDValue ConstNode) const {
13542 // Let the DAGCombiner decide for vector types and large types.
13543 const EVT VT = AddNode.getValueType();
13544 if (VT.isVector() || VT.getScalarSizeInBits() > 64)
13547 // It is worse if c1 is a legal add immediate while c1*c2 is not,
13548 // and has to be composed of at least two instructions.
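// Illustrative example (assumed): for (x + 1) * 65537, the addend 1 is a legal
// add immediate but 65537 (0x10001) needs a MOVZ/MOVK pair, so folding the add
// through the multiply would cost an extra instruction.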
13549 const ConstantSDNode *C1Node = cast<ConstantSDNode>(AddNode.getOperand(1));
13550 const ConstantSDNode *C2Node = cast<ConstantSDNode>(ConstNode);
13551 const int64_t C1 = C1Node->getSExtValue();
13552 const APInt C1C2 = C1Node->getAPIntValue() * C2Node->getAPIntValue();
13553 if (!isLegalAddImmediate(C1) || isLegalAddImmediate(C1C2.getSExtValue()))
13555 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
13556 AArch64_IMM::expandMOVImm(C1C2.getZExtValue(), VT.getSizeInBits(), Insn);
13557 if (Insn.size() > 1)
13560 // Default to true and let the DAGCombiner decide.
13564 // Integer comparisons are implemented with ADDS/SUBS, so the range of valid
13565 // immediates is the same as for an add or a sub.
13566 bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
13567 return isLegalAddImmediate(Immed);
13570 /// isLegalAddressingMode - Return true if the addressing mode represented
13571 /// by AM is legal for this target, for a load/store of the specified type.
13572 bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
13573 const AddrMode &AM, Type *Ty,
13574 unsigned AS, Instruction *I) const {
13575 // AArch64 has five basic addressing modes:
13576 //  reg
13577 //  reg + 9-bit signed offset
13578 //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
13579 //  reg + reg
13580 //  reg + SIZE_IN_BYTES * reg
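// Illustrative examples (assumed, for an i64 access): [x0], [x0, #-256],
// [x0, #32760] (4095 * 8) and [x0, x1, lsl #3] are legal here, while
// [x0, #32768] is not encodable in a single load/store.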
13582 // No global is ever allowed as a base.
13586 // No reg+reg+imm addressing.
13587 if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
13590 // FIXME: Update this method to support scalable addressing modes.
13591 if (isa<ScalableVectorType>(Ty)) {
13592 uint64_t VecElemNumBytes =
13593 DL.getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()) / 8;
13594 return AM.HasBaseReg && !AM.BaseOffs &&
13595 (AM.Scale == 0 || (uint64_t)AM.Scale == VecElemNumBytes);
13598 // check reg + imm case:
13599 // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
13600 uint64_t NumBytes = 0;
13601 if (Ty->isSized()) {
13602 uint64_t NumBits = DL.getTypeSizeInBits(Ty);
13603 NumBytes = NumBits / 8;
13604 if (!isPowerOf2_64(NumBits))
13609 int64_t Offset = AM.BaseOffs;
13611 // 9-bit signed offset
13612 if (isInt<9>(Offset))
13615 // 12-bit unsigned offset
13616 unsigned shift = Log2_64(NumBytes);
13617 if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
13618 // Must be a multiple of NumBytes (NumBytes is a power of 2)
13619 (Offset >> shift) << shift == Offset)
13624 // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
13626 return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
13629 bool AArch64TargetLowering::shouldConsiderGEPOffsetSplit() const {
13630 // Consider splitting large offset of struct or array.
13634 InstructionCost AArch64TargetLowering::getScalingFactorCost(
13635 const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const {
13636 // Scaling factors are not free at all.
13637 // Operands | Rt Latency
13638 // -------------------------------------------
13639 // Rt, [Xn, Xm] | 4
13640 // -------------------------------------------
13641 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
13642 // Rt, [Xn, Wm, <extend> #imm] |
13643 if (isLegalAddressingMode(DL, AM, Ty, AS))
13644 // Scale represents reg2 * scale, thus account for 1 if
13645 // it is not equal to 0 or 1.
13646 return AM.Scale != 0 && AM.Scale != 1;
13650 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
13651 const MachineFunction &MF, EVT VT) const {
13652 VT = VT.getScalarType();
13654 if (!VT.isSimple())
13657 switch (VT.getSimpleVT().SimpleTy) {
13659 return Subtarget->hasFullFP16();
13670 bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
13672 switch (Ty->getScalarType()->getTypeID()) {
13673 case Type::FloatTyID:
13674 case Type::DoubleTyID:
13681 bool AArch64TargetLowering::generateFMAsInMachineCombiner(
13682 EVT VT, CodeGenOpt::Level OptLevel) const {
13683 return (OptLevel >= CodeGenOpt::Aggressive) && !VT.isScalableVector() &&
13684 !useSVEForFixedLengthVectorVT(VT);
13688 AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
13689 // LR is a callee-save register, but we must treat it as clobbered by any call
13690 // site. Hence we include LR in the scratch registers, which are in turn added
13691 // as implicit-defs for stackmaps and patchpoints.
13692 static const MCPhysReg ScratchRegs[] = {
13693 AArch64::X16, AArch64::X17, AArch64::LR, 0
13695 return ScratchRegs;
13699 AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
13700 CombineLevel Level) const {
13701 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13702 N->getOpcode() == ISD::SRL) &&
13703 "Expected shift op");
13705 SDValue ShiftLHS = N->getOperand(0);
13706 EVT VT = N->getValueType(0);
13708 // If ShiftLHS is unsigned bit extraction: ((x >> C) & mask), then do not combine
13709 // it with shift 'N' to let it be lowered to UBFX.
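// Illustrative example (assumed): in (((x >> 4) & 0xff) << 2), the inner
// ((x >> 4) & 0xff) lowers to a single UBFX, so we decline to commute the
// outer shift into it.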
13710 if (ShiftLHS.getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
13711 isa<ConstantSDNode>(ShiftLHS.getOperand(1))) {
13712 uint64_t TruncMask = ShiftLHS.getConstantOperandVal(1);
13713 if (isMask_64(TruncMask) &&
13714 ShiftLHS.getOperand(0).getOpcode() == ISD::SRL &&
13715 isa<ConstantSDNode>(ShiftLHS.getOperand(0).getOperand(1)))
13721 bool AArch64TargetLowering::isDesirableToCommuteXorWithShift(
13722 const SDNode *N) const {
13723 assert(N->getOpcode() == ISD::XOR &&
13724 (N->getOperand(0).getOpcode() == ISD::SHL ||
13725 N->getOperand(0).getOpcode() == ISD::SRL) &&
13726 "Expected XOR(SHIFT) pattern");
13728 // Only commute if the entire NOT mask is a hidden shifted mask.
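// Illustrative example (assumed, i32): for (xor (shl x, 8), 0xffffff00) the
// xor constant masks exactly the 24 bits produced by the shift (MaskIdx == 8,
// MaskLen == 24), so the commute is allowed.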
13729 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13730 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13731 if (XorC && ShiftC) {
13732 unsigned MaskIdx, MaskLen;
13733 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13734 unsigned ShiftAmt = ShiftC->getZExtValue();
13735 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13736 if (N->getOperand(0).getOpcode() == ISD::SHL)
13737 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
13738 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13745 bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
13746 const SDNode *N, CombineLevel Level) const {
13747 assert(((N->getOpcode() == ISD::SHL &&
13748 N->getOperand(0).getOpcode() == ISD::SRL) ||
13749 (N->getOpcode() == ISD::SRL &&
13750 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13751 "Expected shift-shift mask");
13752 // Don't allow multiuse shift folding with the same shift amount.
13753 if (!N->getOperand(0)->hasOneUse())
13756 // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
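// Illustrative example (assumed, i32): srl(shl(x, 8), 16) extracts bits 8..23
// of x and is better kept for UBFX, whereas srl(shl(x, 16), 8) has C1 >= C2
// and can safely become a shift plus mask.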
13757 EVT VT = N->getValueType(0);
13758 if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
13759 auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13760 auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13761 return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
13767 bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
13769 assert(Ty->isIntegerTy());
13771 unsigned BitSize = Ty->getPrimitiveSizeInBits();
13775 int64_t Val = Imm.getSExtValue();
13776 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
13779 if ((int64_t)Val < 0)
13782 Val &= (1LL << 32) - 1;
13784 unsigned LZ = countLeadingZeros((uint64_t)Val);
13785 unsigned Shift = (63 - LZ) / 16;
13786 // MOVZ is free so return true for one or fewer MOVK.
13790 bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
13791 unsigned Index) const {
13792 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
13795 return (Index == 0 || Index == ResVT.getVectorMinNumElements());
13798 /// Turn vector tests of the signbit in the form of:
13799 ///   xor (sra X, elt_size(X)-1), -1
13800 /// into:
13801 ///   cmge X, X, #0
13802 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
13803 const AArch64Subtarget *Subtarget) {
13804 EVT VT = N->getValueType(0);
13805 if (!Subtarget->hasNEON() || !VT.isVector())
13808 // There must be a shift right algebraic before the xor, and the xor must be a
13809 // 'not' operation.
13810 SDValue Shift = N->getOperand(0);
13811 SDValue Ones = N->getOperand(1);
13812 if (Shift.getOpcode() != AArch64ISD::VASHR || !Shift.hasOneUse() ||
13813 !ISD::isBuildVectorAllOnes(Ones.getNode()))
13816 // The shift should be smearing the sign bit across each vector element.
13817 auto *ShiftAmt = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
13818 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
13819 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
13822 return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Shift.getOperand(0));
13825 // Given a vecreduce_add node, detect the below pattern and convert it to the
13826 // node sequence with UABDL, [S|U]ABD and UADDLP.
13827 //
13828 // i32 vecreduce_add(
13829 //   v16i32 abs(
13830 //     v16i32 sub(
13831 //       v16i32 [sign|zero]_extend(v16i8 a), v16i32 [sign|zero]_extend(v16i8 b))))
13832 // =================>
13833 // i32 vecreduce_add(
13834 //   v4i32 UADDLP(
13835 //     v8i16 add(
13836 //       v8i16 zext(
13837 //         v8i8 [S|U]ABD low8:v16i8 a, low8:v16i8 b
13838 //       v8i16 zext(
13839 //         v8i8 [S|U]ABD high8:v16i8 a, high8:v16i8 b
13840 static SDValue performVecReduceAddCombineWithUADDLP(SDNode *N,
13841 SelectionDAG &DAG) {
13842 // Assumed i32 vecreduce_add
13843 if (N->getValueType(0) != MVT::i32)
13846 SDValue VecReduceOp0 = N->getOperand(0);
13847 unsigned Opcode = VecReduceOp0.getOpcode();
13848 // Assumed v16i32 abs
13849 if (Opcode != ISD::ABS || VecReduceOp0->getValueType(0) != MVT::v16i32)
13852 SDValue ABS = VecReduceOp0;
13853 // Assumed v16i32 sub
13854 if (ABS->getOperand(0)->getOpcode() != ISD::SUB ||
13855 ABS->getOperand(0)->getValueType(0) != MVT::v16i32)
13858 SDValue SUB = ABS->getOperand(0);
13859 unsigned Opcode0 = SUB->getOperand(0).getOpcode();
13860 unsigned Opcode1 = SUB->getOperand(1).getOpcode();
13861 // Assumed v16i32 type
13862 if (SUB->getOperand(0)->getValueType(0) != MVT::v16i32 ||
13863 SUB->getOperand(1)->getValueType(0) != MVT::v16i32)
13866 // Assumed zext or sext
13867 bool IsZExt = false;
13868 if (Opcode0 == ISD::ZERO_EXTEND && Opcode1 == ISD::ZERO_EXTEND) {
13870 } else if (Opcode0 == ISD::SIGN_EXTEND && Opcode1 == ISD::SIGN_EXTEND) {
13875 SDValue EXT0 = SUB->getOperand(0);
13876 SDValue EXT1 = SUB->getOperand(1);
13877 // Assumed zext's operand has v16i8 type
13878 if (EXT0->getOperand(0)->getValueType(0) != MVT::v16i8 ||
13879 EXT1->getOperand(0)->getValueType(0) != MVT::v16i8)
13882 // Pattern is detected. Let's convert it to a sequence of nodes.
13885 // First, create the node pattern of UABD/SABD.
13886 SDValue UABDHigh8Op0 =
13887 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
13888 DAG.getConstant(8, DL, MVT::i64));
13889 SDValue UABDHigh8Op1 =
13890 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
13891 DAG.getConstant(8, DL, MVT::i64));
13892 SDValue UABDHigh8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
13893 UABDHigh8Op0, UABDHigh8Op1);
13894 SDValue UABDL = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDHigh8);
13896 // Second, create the node pattern of UABAL.
13897 SDValue UABDLo8Op0 =
13898 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT0->getOperand(0),
13899 DAG.getConstant(0, DL, MVT::i64));
13900 SDValue UABDLo8Op1 =
13901 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i8, EXT1->getOperand(0),
13902 DAG.getConstant(0, DL, MVT::i64));
13903 SDValue UABDLo8 = DAG.getNode(IsZExt ? ISD::ABDU : ISD::ABDS, DL, MVT::v8i8,
13904 UABDLo8Op0, UABDLo8Op1);
13905 SDValue ZExtUABD = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v8i16, UABDLo8);
13906 SDValue UABAL = DAG.getNode(ISD::ADD, DL, MVT::v8i16, UABDL, ZExtUABD);
13908 // Third, create the node of UADDLP.
13909 SDValue UADDLP = DAG.getNode(AArch64ISD::UADDLP, DL, MVT::v4i32, UABAL);
13911 // Fourth, create the node of VECREDUCE_ADD.
13912 return DAG.getNode(ISD::VECREDUCE_ADD, DL, MVT::i32, UADDLP);
13915 // Turn a v8i8/v16i8 extended vecreduce into a udot/sdot and vecreduce
13916 // vecreduce.add(ext(A)) to vecreduce.add(DOT(zero, A, one))
13917 // vecreduce.add(mul(ext(A), ext(B))) to vecreduce.add(DOT(zero, A, B))
13918 static SDValue performVecReduceAddCombine(SDNode *N, SelectionDAG &DAG,
13919 const AArch64Subtarget *ST) {
13920 if (!ST->hasDotProd())
13921 return performVecReduceAddCombineWithUADDLP(N, DAG);
13923 SDValue Op0 = N->getOperand(0);
13924 if (N->getValueType(0) != MVT::i32 ||
13925 Op0.getValueType().getVectorElementType() != MVT::i32)
13928 unsigned ExtOpcode = Op0.getOpcode();
13931 if (ExtOpcode == ISD::MUL) {
13932 A = Op0.getOperand(0);
13933 B = Op0.getOperand(1);
13934 if (A.getOpcode() != B.getOpcode() ||
13935 A.getOperand(0).getValueType() != B.getOperand(0).getValueType())
13937 ExtOpcode = A.getOpcode();
13939 if (ExtOpcode != ISD::ZERO_EXTEND && ExtOpcode != ISD::SIGN_EXTEND)
13942 EVT Op0VT = A.getOperand(0).getValueType();
13943 if (Op0VT != MVT::v8i8 && Op0VT != MVT::v16i8)
13947 // For non-mla reductions B can be set to 1. For MLA we take the operand of
13950 B = DAG.getConstant(1, DL, Op0VT);
13952 B = B.getOperand(0);
13955 DAG.getConstant(0, DL, Op0VT == MVT::v8i8 ? MVT::v2i32 : MVT::v4i32);
13957 (ExtOpcode == ISD::ZERO_EXTEND) ? AArch64ISD::UDOT : AArch64ISD::SDOT;
13958 SDValue Dot = DAG.getNode(DotOpcode, DL, Zeros.getValueType(), Zeros,
13959 A.getOperand(0), B);
13960 return DAG.getNode(ISD::VECREDUCE_ADD, DL, N->getValueType(0), Dot);
13963 // Given an (integer) vecreduce, we know the order of the inputs does not
13964 // matter. We can convert UADDV(add(zext(extract_lo(x)), zext(extract_hi(x))))
13965 // into UADDV(UADDLP(x)). This can also happen through an extra add, where we
13966 // transform UADDV(add(y, add(zext(extract_lo(x)), zext(extract_hi(x))))).
13967 static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) {
13968 auto DetectAddExtract = [&](SDValue A) {
13969 // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning
13970 // UADDLP(x) if found.
13971 if (A.getOpcode() != ISD::ADD)
13973 EVT VT = A.getValueType();
13974 SDValue Op0 = A.getOperand(0);
13975 SDValue Op1 = A.getOperand(1);
13976 if (Op0.getOpcode() != Op1.getOpcode() ||
13977 (Op0.getOpcode() != ISD::ZERO_EXTEND &&
13978 Op0.getOpcode() != ISD::SIGN_EXTEND))
13980 SDValue Ext0 = Op0.getOperand(0);
13981 SDValue Ext1 = Op1.getOperand(0);
13982 if (Ext0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13983 Ext1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
13984 Ext0.getOperand(0) != Ext1.getOperand(0))
13986 // Check that the type is twice the add types, and the extracts are from
13987 // upper/lower parts of the same source.
13988 if (Ext0.getOperand(0).getValueType().getVectorNumElements() !=
13989 VT.getVectorNumElements() * 2)
13991 if ((Ext0.getConstantOperandVal(1) != 0 &&
13992 Ext1.getConstantOperandVal(1) != VT.getVectorNumElements()) &&
13993 (Ext1.getConstantOperandVal(1) != 0 &&
13994 Ext0.getConstantOperandVal(1) != VT.getVectorNumElements()))
13996 unsigned Opcode = Op0.getOpcode() == ISD::ZERO_EXTEND ? AArch64ISD::UADDLP
13997 : AArch64ISD::SADDLP;
13998 return DAG.getNode(Opcode, SDLoc(A), VT, Ext0.getOperand(0));
14001 SDValue A = N->getOperand(0);
14002 if (SDValue R = DetectAddExtract(A))
14003 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R);
14004 if (A.getOpcode() == ISD::ADD) {
14005 if (SDValue R = DetectAddExtract(A.getOperand(0)))
14006 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14007 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14009 if (SDValue R = DetectAddExtract(A.getOperand(1)))
14010 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
14011 DAG.getNode(ISD::ADD, SDLoc(A), A.getValueType(), R,
14018 static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
14019 TargetLowering::DAGCombinerInfo &DCI,
14020 const AArch64Subtarget *Subtarget) {
14021 if (DCI.isBeforeLegalizeOps())
14024 return foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
14028 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
14030 SmallVectorImpl<SDNode *> &Created) const {
14031 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14032 if (isIntDivCheap(N->getValueType(0), Attr))
14033 return SDValue(N,0); // Lower SDIV as SDIV
14035 EVT VT = N->getValueType(0);
14037 // For scalable and fixed types, mark the operation as cheap so we can handle
14038 // it much later. This allows us to handle larger-than-legal types.
14039 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14040 return SDValue(N, 0);
14042 // fold (sdiv X, pow2)
14043 if ((VT != MVT::i32 && VT != MVT::i64) ||
14044 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14048 SDValue N0 = N->getOperand(0);
14049 unsigned Lg2 = Divisor.countTrailingZeros();
14050 SDValue Zero = DAG.getConstant(0, DL, VT);
14051 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14053 // Add (N0 < 0) ? Pow2 - 1 : 0;
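// Worked example (illustrative), sdiv x, 8: Add = x + 7, the CSEL picks Add
// when x < 0 and x otherwise, and the final asr #3 gives the
// round-towards-zero quotient.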
14055 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
14056 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
14057 SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
14059 Created.push_back(Cmp.getNode());
14060 Created.push_back(Add.getNode());
14061 Created.push_back(CSel.getNode());
14065 DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
14067 // If we're dividing by a positive value, we're done. Otherwise, we must
14068 // negate the result.
14069 if (Divisor.isNonNegative())
14072 Created.push_back(SRA.getNode());
14073 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
14077 AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
14079 SmallVectorImpl<SDNode *> &Created) const {
14080 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
14081 if (isIntDivCheap(N->getValueType(0), Attr))
14082 return SDValue(N, 0); // Lower SREM as SREM
14084 EVT VT = N->getValueType(0);
14086 // For scalable and fixed types, mark the operation as cheap so we can handle
14087 // it much later. This allows us to handle larger-than-legal types.
14088 if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
14089 return SDValue(N, 0);
14091 // fold (srem X, pow2)
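// Worked example (illustrative), srem x, 8: for x >= 0 the result is x & 7,
// and for x < 0 it is -((-x) & 7), matching the truncating remainder.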
14092 if ((VT != MVT::i32 && VT != MVT::i64) ||
14093 !(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
14096 unsigned Lg2 = Divisor.countTrailingZeros();
14101 SDValue N0 = N->getOperand(0);
14102 SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
14103 SDValue Zero = DAG.getConstant(0, DL, VT);
14104 SDValue CCVal, CSNeg;
14106 SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
14107 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14108 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
14110 Created.push_back(Cmp.getNode());
14111 Created.push_back(And.getNode());
14113 SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
14114 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
14116 SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
14117 SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
14118 SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
14119 CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
14122 Created.push_back(Negs.getNode());
14123 Created.push_back(AndPos.getNode());
14124 Created.push_back(AndNeg.getNode());
14130 static bool IsSVECntIntrinsic(SDValue S) {
14131 switch(getIntrinsicID(S.getNode())) {
14134 case Intrinsic::aarch64_sve_cntb:
14135 case Intrinsic::aarch64_sve_cnth:
14136 case Intrinsic::aarch64_sve_cntw:
14137 case Intrinsic::aarch64_sve_cntd:
14143 /// Calculates what the pre-extend type is, based on the extension
14144 /// operation node provided by \p Extend.
14146 /// In the case that \p Extend is a SIGN_EXTEND or a ZERO_EXTEND, the
14147 /// pre-extend type is pulled directly from the operand, while other extend
14148 /// operations need a bit more inspection to get this information.
14150 /// \param Extend The SDNode from the DAG that represents the extend operation
14152 /// \returns The type representing the \p Extend source type, or \p MVT::Other
14153 /// if no valid type can be determined
14154 static EVT calculatePreExtendType(SDValue Extend) {
14155 switch (Extend.getOpcode()) {
14156 case ISD::SIGN_EXTEND:
14157 case ISD::ZERO_EXTEND:
14158 return Extend.getOperand(0).getValueType();
14159 case ISD::AssertSext:
14160 case ISD::AssertZext:
14161 case ISD::SIGN_EXTEND_INREG: {
14162 VTSDNode *TypeNode = dyn_cast<VTSDNode>(Extend.getOperand(1));
14165 return TypeNode->getVT();
14168 ConstantSDNode *Constant =
14169 dyn_cast<ConstantSDNode>(Extend.getOperand(1).getNode());
14173 uint32_t Mask = Constant->getZExtValue();
14175 if (Mask == UCHAR_MAX)
14177 else if (Mask == USHRT_MAX)
14179 else if (Mask == UINT_MAX)
14189 /// Combines a buildvector(sext/zext) or shuffle(sext/zext, undef) node pattern
14190 /// into sext/zext(buildvector) or sext/zext(shuffle) making use of the vector
14191 /// SExt/ZExt rather than the scalar SExt/ZExt
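/// For example (illustrative):
///   build_vector (zext i8 %a to i16), (zext i8 %b to i16), ...
/// becomes
///   zext (build_vector i8 %a, i8 %b, ...) to vNi16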
14192 static SDValue performBuildShuffleExtendCombine(SDValue BV, SelectionDAG &DAG) {
14193 EVT VT = BV.getValueType();
14194 if (BV.getOpcode() != ISD::BUILD_VECTOR &&
14195 BV.getOpcode() != ISD::VECTOR_SHUFFLE)
14198 // Use the first item in the buildvector/shuffle to get the size of the
14199 // extend, and make sure it looks valid.
14200 SDValue Extend = BV->getOperand(0);
14201 unsigned ExtendOpcode = Extend.getOpcode();
14202 bool IsSExt = ExtendOpcode == ISD::SIGN_EXTEND ||
14203 ExtendOpcode == ISD::SIGN_EXTEND_INREG ||
14204 ExtendOpcode == ISD::AssertSext;
14205 if (!IsSExt && ExtendOpcode != ISD::ZERO_EXTEND &&
14206 ExtendOpcode != ISD::AssertZext && ExtendOpcode != ISD::AND)
14208 // Shuffle inputs are vectors, so limit to SIGN_EXTEND and ZERO_EXTEND to ensure
14209 // calculatePreExtendType will work without issue.
14210 if (BV.getOpcode() == ISD::VECTOR_SHUFFLE &&
14211 ExtendOpcode != ISD::SIGN_EXTEND && ExtendOpcode != ISD::ZERO_EXTEND)
14214 // Restrict valid pre-extend data type
14215 EVT PreExtendType = calculatePreExtendType(Extend);
14216 if (PreExtendType == MVT::Other ||
14217 PreExtendType.getScalarSizeInBits() != VT.getScalarSizeInBits() / 2)
14220 // Make sure all other operands are equally extended
14221 for (SDValue Op : drop_begin(BV->ops())) {
14224 unsigned Opc = Op.getOpcode();
14225 bool OpcIsSExt = Opc == ISD::SIGN_EXTEND || Opc == ISD::SIGN_EXTEND_INREG ||
14226 Opc == ISD::AssertSext;
14227 if (OpcIsSExt != IsSExt || calculatePreExtendType(Op) != PreExtendType)
14233 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
14234 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType);
14235 EVT PreExtendLegalType =
14236 PreExtendType.getScalarSizeInBits() < 32 ? MVT::i32 : PreExtendType;
14237 SmallVector<SDValue, 8> NewOps;
14238 for (SDValue Op : BV->ops())
14239 NewOps.push_back(Op.isUndef() ? DAG.getUNDEF(PreExtendLegalType)
14240 : DAG.getAnyExtOrTrunc(Op.getOperand(0), DL,
14241 PreExtendLegalType));
14242 NBV = DAG.getNode(ISD::BUILD_VECTOR, DL, PreExtendVT, NewOps);
14243 } else { // BV.getOpcode() == ISD::VECTOR_SHUFFLE
14244 EVT PreExtendVT = VT.changeVectorElementType(PreExtendType.getScalarType());
14245 NBV = DAG.getVectorShuffle(PreExtendVT, DL, BV.getOperand(0).getOperand(0),
14246 BV.getOperand(1).isUndef()
14247 ? DAG.getUNDEF(PreExtendVT)
14248 : BV.getOperand(1).getOperand(0),
14249 cast<ShuffleVectorSDNode>(BV)->getMask());
14251 return DAG.getNode(IsSExt ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL, VT, NBV);
14254 /// Combines a mul(dup(sext/zext)) node pattern into mul(sext/zext(dup))
14255 /// making use of the vector SExt/ZExt rather than the scalar SExt/ZExt
14256 static SDValue performMulVectorExtendCombine(SDNode *Mul, SelectionDAG &DAG) {
14257 // If the value type isn't a vector, none of the operands are going to be dups
14258 EVT VT = Mul->getValueType(0);
14259 if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64)
14260 return SDValue();
14262 SDValue Op0 = performBuildShuffleExtendCombine(Mul->getOperand(0), DAG);
14263 SDValue Op1 = performBuildShuffleExtendCombine(Mul->getOperand(1), DAG);
14265 // If neither operand has been changed, don't make any further changes.
14266 if (!Op0 && !Op1)
14267 return SDValue();
14269 SDLoc DL(Mul);
14270 return DAG.getNode(Mul->getOpcode(), DL, VT, Op0 ? Op0 : Mul->getOperand(0),
14271 Op1 ? Op1 : Mul->getOperand(1));
14272 }
14274 // Combine v4i32 Mul(And(Srl(X, 15), 0x10001), 0xffff) -> v8i16 CMLTz
14275 // Same for other types with equivalent constants.
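// The srl/and/mul sequence isolates the sign bit of each half-width element
// and then replicates it across that element, which is exactly what a
// half-width CMLTz (compare-less-than-zero) produces, so the whole sequence
// can be replaced by a compare on the narrower element type.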
14276 static SDValue performMulVectorCmpZeroCombine(SDNode *N, SelectionDAG &DAG) {
14277 EVT VT = N->getValueType(0);
14278 if (VT != MVT::v2i64 && VT != MVT::v1i64 && VT != MVT::v2i32 &&
14279 VT != MVT::v4i32 && VT != MVT::v4i16 && VT != MVT::v8i16)
14281 if (N->getOperand(0).getOpcode() != ISD::AND ||
14282 N->getOperand(0).getOperand(0).getOpcode() != ISD::SRL)
14285 SDValue And = N->getOperand(0);
14286 SDValue Srl = And.getOperand(0);
14289 if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), V1) ||
14290 !ISD::isConstantSplatVector(And.getOperand(1).getNode(), V2) ||
14291 !ISD::isConstantSplatVector(Srl.getOperand(1).getNode(), V3))
14294 unsigned HalfSize = VT.getScalarSizeInBits() / 2;
14295 if (!V1.isMask(HalfSize) || V2 != (1ULL | 1ULL << HalfSize) ||
14296 V3 != (HalfSize - 1))
14299 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
14300 EVT::getIntegerVT(*DAG.getContext(), HalfSize),
14301 VT.getVectorElementCount() * 2);
14304 SDValue In = DAG.getNode(AArch64ISD::NVCAST, DL, HalfVT, Srl.getOperand(0));
14305 SDValue CM = DAG.getNode(AArch64ISD::CMLTz, DL, HalfVT, In);
14306 return DAG.getNode(AArch64ISD::NVCAST, DL, VT, CM);
14309 static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
14310 TargetLowering::DAGCombinerInfo &DCI,
14311 const AArch64Subtarget *Subtarget) {
14313 if (SDValue Ext = performMulVectorExtendCombine(N, DAG))
14314 return Ext;
14315 if (SDValue Ext = performMulVectorCmpZeroCombine(N, DAG))
14316 return Ext;
14318 if (DCI.isBeforeLegalizeOps())
14319 return SDValue();
14321 // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y,
14322 // and in MachineCombiner pass, add+mul will be combined into madd.
14323 // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
14324 SDLoc DL(N);
14325 EVT VT = N->getValueType(0);
14326 SDValue N0 = N->getOperand(0);
14327 SDValue N1 = N->getOperand(1);
14329 unsigned AddSubOpc;
14331 auto IsAddSubWith1 = [&](SDValue V) -> bool {
14332 AddSubOpc = V->getOpcode();
14333 if ((AddSubOpc == ISD::ADD || AddSubOpc == ISD::SUB) && V->hasOneUse()) {
14334 SDValue Opnd = V->getOperand(1);
14335 MulOper = V->getOperand(0);
14336 if (AddSubOpc == ISD::SUB)
14337 std::swap(Opnd, MulOper);
14338 if (auto C = dyn_cast<ConstantSDNode>(Opnd))
14339 return C->isOne();
14340 }
14341 return false;
14342 };
14344 if (IsAddSubWith1(N0)) {
14345 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N1, MulOper);
14346 return DAG.getNode(AddSubOpc, DL, VT, N1, MulVal);
14347 }
14349 if (IsAddSubWith1(N1)) {
14350 SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, N0, MulOper);
14351 return DAG.getNode(AddSubOpc, DL, VT, N0, MulVal);
14352 }
14354 // The below optimizations require a constant RHS.
14355 if (!isa<ConstantSDNode>(N1))
14356 return SDValue();
14358 ConstantSDNode *C = cast<ConstantSDNode>(N1);
14359 const APInt &ConstValue = C->getAPIntValue();
14361 // Allow the scaling to be folded into the `cnt` instruction by preventing
14362 // the scaling from being obscured here. This makes it easier to pattern match.
14363 if (IsSVECntIntrinsic(N0) ||
14364 (N0->getOpcode() == ISD::TRUNCATE &&
14365 (IsSVECntIntrinsic(N0->getOperand(0)))))
14366 if (ConstValue.sge(1) && ConstValue.sle(16))
14367 return SDValue();
14369 // Multiplication of a power of two plus/minus one can be done more
14370 // cheaply as a shift+add/sub. For now, this is true unilaterally. If
14371 // future CPUs have a cheaper MADD instruction, this may need to be
14372 // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
14373 // 64-bit is 5 cycles, so this is always a win.
14374 // More aggressively, some multiplications N0 * C can be lowered to
14375 // shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
14376 // e.g. 6=3*2=(2+1)*2.
14377 // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
14378 // which equals (1+2)*16-(1+2).
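// For example, C = 6 = 0b110: TrailingZeroes = 1 and the shifted value 3 is
// 2^1 + 1, so the multiply becomes (shl (add (shl x, 1), x), 1).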
14380 // TrailingZeroes is used to test if the mul can be lowered to
14381 // shift+add+shift.
14382 unsigned TrailingZeroes = ConstValue.countTrailingZeros();
14383 if (TrailingZeroes) {
14384 // Conservatively do not lower to shift+add+shift if the mul might be
14385 // folded into smul or umul.
14386 if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) ||
14387 isZeroExtended(N0.getNode(), DAG)))
14388 return SDValue();
14389 // Conservatively do not lower to shift+add+shift if the mul might be
14390 // folded into madd or msub.
14391 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD ||
14392 N->use_begin()->getOpcode() == ISD::SUB))
14393 return SDValue();
14394 }
14395 // Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
14396 // and shift+add+shift.
14397 APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);
14399 unsigned ShiftAmt;
14400 // Is the shifted value the LHS operand of the add/sub?
14401 bool ShiftValUseIsN0 = true;
14402 // Do we need to negate the result?
14403 bool NegateResult = false;
14405 if (ConstValue.isNonNegative()) {
14406 // (mul x, 2^N + 1) => (add (shl x, N), x)
14407 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14408 // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
14409 APInt SCVMinus1 = ShiftedConstValue - 1;
14410 APInt CVPlus1 = ConstValue + 1;
14411 if (SCVMinus1.isPowerOf2()) {
14412 ShiftAmt = SCVMinus1.logBase2();
14413 AddSubOpc = ISD::ADD;
14414 } else if (CVPlus1.isPowerOf2()) {
14415 ShiftAmt = CVPlus1.logBase2();
14416 AddSubOpc = ISD::SUB;
14417 } else
14418 return SDValue();
14419 } else {
14420 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14421 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14422 APInt CVNegPlus1 = -ConstValue + 1;
14423 APInt CVNegMinus1 = -ConstValue - 1;
14424 if (CVNegPlus1.isPowerOf2()) {
14425 ShiftAmt = CVNegPlus1.logBase2();
14426 AddSubOpc = ISD::SUB;
14427 ShiftValUseIsN0 = false;
14428 } else if (CVNegMinus1.isPowerOf2()) {
14429 ShiftAmt = CVNegMinus1.logBase2();
14430 AddSubOpc = ISD::ADD;
14431 NegateResult = true;
14432 } else
14433 return SDValue();
14434 }
14436 SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
14437 DAG.getConstant(ShiftAmt, DL, MVT::i64));
14439 SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
14440 SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
14441 SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
14442 assert(!(NegateResult && TrailingZeroes) &&
14443 "NegateResult and TrailingZeroes cannot both be true for now.");
14444 // Negate the result.
14445 if (NegateResult)
14446 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
14447 // Shift the result.
14448 if (TrailingZeroes)
14449 return DAG.getNode(ISD::SHL, DL, VT, Res,
14450 DAG.getConstant(TrailingZeroes, DL, MVT::i64));
14452 return Res;
14453 }
14454 static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
14455 SelectionDAG &DAG) {
14456 // Take advantage of vector comparisons producing 0 or -1 in each lane to
14457 // optimize away the operation when it's from a constant.
14459 // The general transformation is:
14460 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
14461 // AND(VECTOR_CMP(x,y), constant2)
14462 // constant2 = UNARYOP(constant)
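// For example, (sint_to_fp (and (setcc ...), splat(1))) can become
// (bitcast (and (setcc ...), (bitcast splat(1.0f)))): each compare lane is
// all-ones or all-zeros, so applying the unary op to the constant up front
// yields the same per-lane values.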
14464 // Early exit if this isn't a vector operation, the operand of the
14465 // unary operation isn't a bitwise AND, or if the sizes of the operations
14466 // aren't the same.
14467 EVT VT = N->getValueType(0);
14468 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
14469 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
14470 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
14473 // Now check that the other operand of the AND is a constant. We could
14474 // make the transformation for non-constant splats as well, but it's unclear
14475 // that would be a benefit as it would not eliminate any operations, just
14476 // perform one more step in scalar code before moving to the vector unit.
14477 if (BuildVectorSDNode *BV =
14478 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
14479 // Bail out if the vector isn't a constant.
14480 if (!BV->isConstant())
14483 // Everything checks out. Build up the new and improved node.
14485 EVT IntVT = BV->getValueType(0);
14486 // Create a new constant of the appropriate type for the transformed
14488 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
14489 // The AND node needs bitcasts to/from an integer vector type around it.
14490 SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
14491 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
14492 N->getOperand(0)->getOperand(0), MaskConst);
14493 SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
14500 static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
14501 const AArch64Subtarget *Subtarget) {
14502 // First try to optimize away the conversion when it's conditionally from
14503 // a constant. Vectors only.
14504 if (SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
14507 EVT VT = N->getValueType(0);
14508 if (VT != MVT::f32 && VT != MVT::f64)
14511 // Only optimize when the source and destination types have the same width.
14512 if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
14515 // If the result of an integer load is only used by an integer-to-float
14516 // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead.
14517 // This eliminates an "integer-to-vector-move" UOP and improves throughput.
14518 SDValue N0 = N->getOperand(0);
14519 if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
14520 // Do not change the width of a volatile load.
14521 !cast<LoadSDNode>(N0)->isVolatile()) {
14522 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
14523 SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
14524 LN0->getPointerInfo(), LN0->getAlign(),
14525 LN0->getMemOperand()->getFlags());
14527 // Make sure successors of the original load stay after it by updating them
14528 // to use the new Chain.
14529 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
14531 unsigned Opcode =
14532 (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
14533 return DAG.getNode(Opcode, SDLoc(N), VT, Load);
14534 }
14536 return SDValue();
14537 }
14539 /// Fold a floating-point multiply by power of two into floating-point to
14540 /// fixed-point conversion.
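/// For example, (v4i32 (fp_to_sint (fmul v4f32 X, splat(8.0)))) becomes the
/// vcvtfp2fxs intrinsic with C = 3, i.e. a single fcvtzs with 3 fractional
/// bits, since multiplying by 2^3 before the convert is the same as a #3
/// fixed-point conversion.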
14541 static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
14542 TargetLowering::DAGCombinerInfo &DCI,
14543 const AArch64Subtarget *Subtarget) {
14544 if (!Subtarget->hasNEON())
14547 if (!N->getValueType(0).isSimple())
14550 SDValue Op = N->getOperand(0);
14551 if (!Op.getValueType().isSimple() || Op.getOpcode() != ISD::FMUL)
14554 if (!Op.getValueType().is64BitVector() && !Op.getValueType().is128BitVector())
14557 SDValue ConstVec = Op->getOperand(1);
14558 if (!isa<BuildVectorSDNode>(ConstVec))
14561 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
14562 uint32_t FloatBits = FloatTy.getSizeInBits();
14563 if (FloatBits != 32 && FloatBits != 64 &&
14564 (FloatBits != 16 || !Subtarget->hasFullFP16()))
14567 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
14568 uint32_t IntBits = IntTy.getSizeInBits();
14569 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
14572 // Avoid conversions where iN is larger than the float (e.g., float -> i64).
14573 if (IntBits > FloatBits)
14576 BitVector UndefElements;
14577 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
14578 int32_t Bits = IntBits == 64 ? 64 : 32;
14579 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
14580 if (C == -1 || C == 0 || C > Bits)
14583 EVT ResTy = Op.getValueType().changeVectorElementTypeToInteger();
14584 if (!DAG.getTargetLoweringInfo().isTypeLegal(ResTy))
14587 if (N->getOpcode() == ISD::FP_TO_SINT_SAT ||
14588 N->getOpcode() == ISD::FP_TO_UINT_SAT) {
14589 EVT SatVT = cast<VTSDNode>(N->getOperand(1))->getVT();
14590 if (SatVT.getScalarSizeInBits() != IntBits || IntBits != FloatBits)
14591 return SDValue();
14592 }
14594 SDLoc DL(N);
14595 bool IsSigned = (N->getOpcode() == ISD::FP_TO_SINT ||
14596 N->getOpcode() == ISD::FP_TO_SINT_SAT);
14597 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
14598 : Intrinsic::aarch64_neon_vcvtfp2fxu;
14599 SDValue FixConv =
14600 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
14601 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
14602 Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
14603 // We can handle smaller integers by generating an extra trunc.
14604 if (IntBits < FloatBits)
14605 FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);
14607 return FixConv;
14608 }
14610 /// Fold a floating-point divide by power of two into fixed-point to
14611 /// floating-point conversion.
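/// For example, (v4f32 (fdiv (sint_to_fp v4i32 X), splat(8.0))) becomes the
/// vcvtfxs2fp intrinsic with C = 3, i.e. a single scvtf with 3 fractional bits.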
14612 static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
14613 TargetLowering::DAGCombinerInfo &DCI,
14614 const AArch64Subtarget *Subtarget) {
14615 if (!Subtarget->hasNEON())
14618 SDValue Op = N->getOperand(0);
14619 unsigned Opc = Op->getOpcode();
14620 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
14621 !Op.getOperand(0).getValueType().isSimple() ||
14622 (Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
14625 SDValue ConstVec = N->getOperand(1);
14626 if (!isa<BuildVectorSDNode>(ConstVec))
14629 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
14630 int32_t IntBits = IntTy.getSizeInBits();
14631 if (IntBits != 16 && IntBits != 32 && IntBits != 64)
14634 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
14635 int32_t FloatBits = FloatTy.getSizeInBits();
14636 if (FloatBits != 32 && FloatBits != 64)
14639 // Avoid conversions where iN is larger than the float (e.g., i64 -> float).
14640 if (IntBits > FloatBits)
14643 BitVector UndefElements;
14644 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
14645 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
14646 if (C == -1 || C == 0 || C > FloatBits)
14647 return SDValue();
14649 MVT ResTy;
14650 unsigned NumLanes = Op.getValueType().getVectorNumElements();
14651 switch (NumLanes) {
14652 default:
14653 return SDValue();
14654 case 2:
14655 ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
14656 break;
14657 case 4:
14658 ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
14659 break;
14660 }
14662 if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
14666 SDValue ConvInput = Op.getOperand(0);
14667 bool IsSigned = Opc == ISD::SINT_TO_FP;
14668 if (IntBits < FloatBits)
14669 ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
14672 unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
14673 : Intrinsic::aarch64_neon_vcvtfxu2fp;
14674 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
14675 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
14676 DAG.getConstant(C, DL, MVT::i32));
14679 /// An EXTR instruction is made up of two shifts, ORed together. This helper
14680 /// searches for and classifies those shifts.
14681 static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
14682 bool &FromHi) {
14683 if (N.getOpcode() == ISD::SHL)
14684 FromHi = false;
14685 else if (N.getOpcode() == ISD::SRL)
14686 FromHi = true;
14687 else
14688 return false;
14690 if (!isa<ConstantSDNode>(N.getOperand(1)))
14691 return false;
14693 ShiftAmount = N->getConstantOperandVal(1);
14694 Src = N->getOperand(0);
14695 return true;
14696 }
14698 /// EXTR instruction extracts a contiguous chunk of bits from two existing
14699 /// registers viewed as a high/low pair. This function looks for the pattern:
14700 /// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it
14701 /// with an EXTR. Can't quite be done in TableGen because the two immediates
14702 /// aren't independent.
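/// For example, for i32: (or (shl Val1, #8), (srl Val2, #24)) becomes
/// (EXTR Val1, Val2, #24), since the two shift amounts sum to the register
/// width.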
14703 static SDValue tryCombineToEXTR(SDNode *N,
14704 TargetLowering::DAGCombinerInfo &DCI) {
14705 SelectionDAG &DAG = DCI.DAG;
14707 EVT VT = N->getValueType(0);
14709 assert(N->getOpcode() == ISD::OR && "Unexpected root");
14711 if (VT != MVT::i32 && VT != MVT::i64)
14715 uint32_t ShiftLHS = 0;
14716 bool LHSFromHi = false;
14717 if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
14721 uint32_t ShiftRHS = 0;
14722 bool RHSFromHi = false;
14723 if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
14726 // If they're both trying to come from the high part of the register, they're
14727 // not really an EXTR.
14728 if (LHSFromHi == RHSFromHi)
14731 if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
14732 return SDValue();
14734 if (LHSFromHi) {
14735 std::swap(LHS, RHS);
14736 std::swap(ShiftLHS, ShiftRHS);
14737 }
14739 return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
14740 DAG.getConstant(ShiftRHS, DL, MVT::i64));
14743 static SDValue tryCombineToBSL(SDNode *N,
14744 TargetLowering::DAGCombinerInfo &DCI) {
14745 EVT VT = N->getValueType(0);
14746 SelectionDAG &DAG = DCI.DAG;
14749 if (!VT.isVector())
14752 // The combining code currently only works for NEON vectors. In particular,
14753 // it does not work for SVE when dealing with vectors wider than 128 bits.
14754 if (!VT.is64BitVector() && !VT.is128BitVector())
14757 SDValue N0 = N->getOperand(0);
14758 if (N0.getOpcode() != ISD::AND)
14761 SDValue N1 = N->getOperand(1);
14762 if (N1.getOpcode() != ISD::AND)
14765 // InstCombine does (not (neg a)) => (add a -1).
14766 // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
14767 // Loop over all combinations of AND operands.
14768 for (int i = 1; i >= 0; --i) {
14769 for (int j = 1; j >= 0; --j) {
14770 SDValue O0 = N0->getOperand(i);
14771 SDValue O1 = N1->getOperand(j);
14772 SDValue Sub, Add, SubSibling, AddSibling;
14774 // Find a SUB and an ADD operand, one from each AND.
14775 if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
14776 Sub = O0;
14777 Add = O1;
14778 SubSibling = N0->getOperand(1 - i);
14779 AddSibling = N1->getOperand(1 - j);
14780 } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
14781 Add = O0;
14782 Sub = O1;
14783 AddSibling = N0->getOperand(1 - i);
14784 SubSibling = N1->getOperand(1 - j);
14785 } else
14786 continue;
14788 if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
14789 continue;
14791 // The all-ones constant is always the right-hand operand of the Add.
14792 if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
14793 continue;
14795 if (Sub.getOperand(1) != Add.getOperand(0))
14796 continue;
14798 return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
14799 }
14800 }
14802 // (or (and a b) (and (not a) c)) => (bsl a b c)
14803 // We only have to look for constant vectors here since the general, variable
14804 // case can be handled in TableGen.
14805 unsigned Bits = VT.getScalarSizeInBits();
14806 uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
14807 for (int i = 1; i >= 0; --i)
14808 for (int j = 1; j >= 0; --j) {
14809 BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
14810 BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
14811 if (!BVN0 || !BVN1)
14814 bool FoundMatch = true;
14815 for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
14816 ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
14817 ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
14818 if (!CN0 || !CN1 ||
14819 CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
14820 FoundMatch = false;
14826 return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
14827 N0->getOperand(1 - i), N1->getOperand(1 - j));
14833 // Given a tree of and/or(csel(0, 1, cc0), csel(0, 1, cc1)), we may be able to
14834 // convert to csel(ccmp(.., cc0)), depending on cc1:
14836 // (AND (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
14837 // =>
14838 // (CSET cc1 (CCMP x1 y1 !cc1 cc0 cmp0))
14839 //
14840 // (OR (CSET cc0 cmp0) (CSET cc1 (CMP x1 y1)))
14841 // =>
14842 // (CSET cc1 (CCMP x1 y1 cc1 !cc0 cmp0))
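// This allows a condition such as (x0 == y0 && x1 > y1) to be lowered as
// CMP + CCMP + CSET instead of materialising both comparisons with CSET and
// combining them with a scalar AND/ORR.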
14843 static SDValue performANDORCSELCombine(SDNode *N, SelectionDAG &DAG) {
14844 EVT VT = N->getValueType(0);
14845 SDValue CSel0 = N->getOperand(0);
14846 SDValue CSel1 = N->getOperand(1);
14848 if (CSel0.getOpcode() != AArch64ISD::CSEL ||
14849 CSel1.getOpcode() != AArch64ISD::CSEL)
14852 if (!CSel0->hasOneUse() || !CSel1->hasOneUse())
14855 if (!isNullConstant(CSel0.getOperand(0)) ||
14856 !isOneConstant(CSel0.getOperand(1)) ||
14857 !isNullConstant(CSel1.getOperand(0)) ||
14858 !isOneConstant(CSel1.getOperand(1)))
14861 SDValue Cmp0 = CSel0.getOperand(3);
14862 SDValue Cmp1 = CSel1.getOperand(3);
14863 AArch64CC::CondCode CC0 = (AArch64CC::CondCode)CSel0.getConstantOperandVal(2);
14864 AArch64CC::CondCode CC1 = (AArch64CC::CondCode)CSel1.getConstantOperandVal(2);
14865 if (!Cmp0->hasOneUse() || !Cmp1->hasOneUse())
14867 if (Cmp1.getOpcode() != AArch64ISD::SUBS &&
14868 Cmp0.getOpcode() == AArch64ISD::SUBS) {
14869 std::swap(Cmp0, Cmp1);
14870 std::swap(CC0, CC1);
14873 if (Cmp1.getOpcode() != AArch64ISD::SUBS)
14874 return SDValue();
14876 SDLoc DL(N);
14877 SDValue CCmp;
14879 if (N->getOpcode() == ISD::AND) {
14880 AArch64CC::CondCode InvCC0 = AArch64CC::getInvertedCondCode(CC0);
14881 SDValue Condition = DAG.getConstant(InvCC0, DL, MVT_CC);
14882 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CC1);
14883 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
14884 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
14885 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
14887 } else {
14888 AArch64CC::CondCode InvCC1 = AArch64CC::getInvertedCondCode(CC1);
14889 SDValue Condition = DAG.getConstant(CC0, DL, MVT_CC);
14890 unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvCC1);
14891 SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
14892 CCmp = DAG.getNode(AArch64ISD::CCMP, DL, MVT_CC, Cmp1.getOperand(0),
14893 Cmp1.getOperand(1), NZCVOp, Condition, Cmp0);
14894 }
14895 return DAG.getNode(AArch64ISD::CSEL, DL, VT, CSel0.getOperand(0),
14896 CSel0.getOperand(1), DAG.getConstant(CC1, DL, MVT::i32),
14897 CCmp);
14898 }
14900 static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
14901 const AArch64Subtarget *Subtarget) {
14902 SelectionDAG &DAG = DCI.DAG;
14903 EVT VT = N->getValueType(0);
14905 if (SDValue R = performANDORCSELCombine(N, DAG))
14906 return R;
14908 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14909 return SDValue();
14911 // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
14912 if (SDValue Res = tryCombineToEXTR(N, DCI))
14913 return Res;
14915 if (SDValue Res = tryCombineToBSL(N, DCI))
14916 return Res;
14918 return SDValue();
14919 }
14921 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
14922 if (!MemVT.getVectorElementType().isSimple())
14925 uint64_t MaskForTy = 0ull;
14926 switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
14927 case MVT::i8:
14928 MaskForTy = 0xffull;
14929 break;
14930 case MVT::i16:
14931 MaskForTy = 0xffffull;
14932 break;
14933 case MVT::i32:
14934 MaskForTy = 0xffffffffull;
14935 break;
14936 default:
14937 return false;
14938 }
14941 if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
14942 if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
14943 return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
14948 static SDValue performSVEAndCombine(SDNode *N,
14949 TargetLowering::DAGCombinerInfo &DCI) {
14950 if (DCI.isBeforeLegalizeOps())
14953 SelectionDAG &DAG = DCI.DAG;
14954 SDValue Src = N->getOperand(0);
14955 unsigned Opc = Src->getOpcode();
14957 // Zero/any extend of an unsigned unpack
14958 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
14959 SDValue UnpkOp = Src->getOperand(0);
14960 SDValue Dup = N->getOperand(1);
14962 if (Dup.getOpcode() != ISD::SPLAT_VECTOR)
14966 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
14970 uint64_t ExtVal = C->getZExtValue();
14972 // If the mask is fully covered by the unpack, we don't need to push
14973 // a new AND onto the operand
14974 EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
14975 if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
14976 (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
14977 (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
14981 // Truncate to prevent a DUP with an overly wide constant
14981 APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
14983 // Otherwise, make sure we propagate the AND to the operand
14985 Dup = DAG.getNode(ISD::SPLAT_VECTOR, DL, UnpkOp->getValueType(0),
14986 DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
14988 SDValue And = DAG.getNode(ISD::AND, DL,
14989 UnpkOp->getValueType(0), UnpkOp, Dup);
14991 return DAG.getNode(Opc, DL, N->getValueType(0), And);
14994 if (!EnableCombineMGatherIntrinsics)
14995 return SDValue();
14997 SDValue Mask = N->getOperand(1);
14999 if (!Src.hasOneUse())
15000 return SDValue();
15002 EVT MemVT;
15004 // SVE load instructions perform an implicit zero-extend, which makes them
15005 // perfect candidates for combining.
15006 switch (Opc) {
15007 case AArch64ISD::LD1_MERGE_ZERO:
15008 case AArch64ISD::LDNF1_MERGE_ZERO:
15009 case AArch64ISD::LDFF1_MERGE_ZERO:
15010 MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
15011 break;
15012 case AArch64ISD::GLD1_MERGE_ZERO:
15013 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
15014 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
15015 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
15016 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
15017 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
15018 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
15019 case AArch64ISD::GLDFF1_MERGE_ZERO:
15020 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
15021 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
15022 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
15023 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
15024 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
15025 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
15026 case AArch64ISD::GLDNT1_MERGE_ZERO:
15027 MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
15028 break;
15029 default:
15030 return SDValue();
15031 }
15033 if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
15034 return Src;
15036 return SDValue();
15037 }
15039 static SDValue performANDCombine(SDNode *N,
15040 TargetLowering::DAGCombinerInfo &DCI) {
15041 SelectionDAG &DAG = DCI.DAG;
15042 SDValue LHS = N->getOperand(0);
15043 SDValue RHS = N->getOperand(1);
15044 EVT VT = N->getValueType(0);
15046 if (SDValue R = performANDORCSELCombine(N, DAG))
15047 return R;
15049 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
15050 return SDValue();
15052 if (VT.isScalableVector())
15053 return performSVEAndCombine(N, DCI);
15055 // The combining code below works only for NEON vectors. In particular, it
15056 // does not work for SVE when dealing with vectors wider than 128 bits.
15057 if (!VT.is64BitVector() && !VT.is128BitVector())
15060 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
15064 // AND does not accept an immediate, so check if we can use a BIC immediate
15065 // instruction instead. We do this here instead of using a (and x, (mvni imm))
15066 // pattern in isel, because some immediates may be lowered to the preferred
15067 // (and x, (movi imm)) form, even though an mvni representation also exists.
15068 APInt DefBits(VT.getSizeInBits(), 0);
15069 APInt UndefBits(VT.getSizeInBits(), 0);
15070 if (resolveBuildVector(BVN, DefBits, UndefBits)) {
15071 SDValue NewOp;
15073 DefBits = ~DefBits;
15074 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15075 DefBits, &LHS)) ||
15076 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15077 DefBits, &LHS)))
15078 return NewOp;
15080 UndefBits = ~UndefBits;
15081 if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::BICi, SDValue(N, 0), DAG,
15082 UndefBits, &LHS)) ||
15083 (NewOp = tryAdvSIMDModImm16(AArch64ISD::BICi, SDValue(N, 0), DAG,
15084 UndefBits, &LHS)))
15085 return NewOp;
15086 }
15088 return SDValue();
15089 }
15091 static bool hasPairwiseAdd(unsigned Opcode, EVT VT, bool FullFP16) {
15092 switch (Opcode) {
15093 case ISD::STRICT_FADD:
15094 case ISD::FADD:
15095 return (FullFP16 && VT == MVT::f16) || VT == MVT::f32 || VT == MVT::f64;
15096 case ISD::ADD:
15097 return VT == MVT::i64;
15098 default:
15099 return false;
15100 }
15101 }
15103 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
15104 AArch64CC::CondCode Cond);
15106 static bool isPredicateCCSettingOp(SDValue N) {
15107 if ((N.getOpcode() == ISD::SETCC) ||
15108 (N.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
15109 (N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilege ||
15110 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilegt ||
15111 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehi ||
15112 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilehs ||
15113 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilele ||
15114 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelo ||
15115 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilels ||
15116 N.getConstantOperandVal(0) == Intrinsic::aarch64_sve_whilelt ||
15117 // get_active_lane_mask is lowered to a whilelo instruction.
15118 N.getConstantOperandVal(0) == Intrinsic::get_active_lane_mask)))
15119 return true;
15121 return false;
15122 }
15124 // Materialize : i1 = extract_vector_elt t37, Constant:i64<0>
15125 // ... into: "ptrue p, all" + PTEST
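// With an all-true governing predicate, the FIRST_ACTIVE condition of PTEST
// reports exactly the value of lane 0, so the extract can be answered from
// the flags without moving the predicate to a general-purpose register.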
15126 static SDValue
15127 performFirstTrueTestVectorCombine(SDNode *N,
15128 TargetLowering::DAGCombinerInfo &DCI,
15129 const AArch64Subtarget *Subtarget) {
15130 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15131 // Make sure PTEST can be legalised with illegal types.
15132 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15135 SDValue N0 = N->getOperand(0);
15136 EVT VT = N0.getValueType();
15138 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1 ||
15139 !isNullConstant(N->getOperand(1)))
15142 // Restrict the DAG combine to only cases where we're extracting from a
15143 // flag-setting operation.
15144 if (!isPredicateCCSettingOp(N0))
15147 // Extracts of lane 0 for SVE can be expressed as PTEST(Op, FIRST) ? 1 : 0
15148 SelectionDAG &DAG = DCI.DAG;
15149 SDValue Pg = getPTrue(DAG, SDLoc(N), VT, AArch64SVEPredPattern::all);
15150 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::FIRST_ACTIVE);
15153 // Materialize : Idx = (add (mul vscale, NumEls), -1)
15154 // i1 = extract_vector_elt t37, Constant:i64<Idx>
15155 // ... into: "ptrue p, all" + PTEST
15156 static SDValue
15157 performLastTrueTestVectorCombine(SDNode *N,
15158 TargetLowering::DAGCombinerInfo &DCI,
15159 const AArch64Subtarget *Subtarget) {
15160 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15161 // Make sure PTEST can be legalised with illegal types.
15162 if (!Subtarget->hasSVE() || DCI.isBeforeLegalize())
15165 SDValue N0 = N->getOperand(0);
15166 EVT OpVT = N0.getValueType();
15168 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
15171 // Idx == (add (mul vscale, NumEls), -1)
15172 SDValue Idx = N->getOperand(1);
15173 if (Idx.getOpcode() != ISD::ADD || !isAllOnesConstant(Idx.getOperand(1)))
15176 SDValue VS = Idx.getOperand(0);
15177 if (VS.getOpcode() != ISD::VSCALE)
15180 unsigned NumEls = OpVT.getVectorElementCount().getKnownMinValue();
15181 if (VS.getConstantOperandVal(0) != NumEls)
15184 // Extracts of lane EC-1 for SVE can be expressed as PTEST(Op, LAST) ? 1 : 0
15185 SelectionDAG &DAG = DCI.DAG;
15186 SDValue Pg = getPTrue(DAG, SDLoc(N), OpVT, AArch64SVEPredPattern::all);
15187 return getPTest(DAG, N->getValueType(0), Pg, N0, AArch64CC::LAST_ACTIVE);
15190 static SDValue
15191 performExtractVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15192 const AArch64Subtarget *Subtarget) {
15193 assert(N->getOpcode() == ISD::EXTRACT_VECTOR_ELT);
15194 if (SDValue Res = performFirstTrueTestVectorCombine(N, DCI, Subtarget))
15195 return Res;
15196 if (SDValue Res = performLastTrueTestVectorCombine(N, DCI, Subtarget))
15197 return Res;
15199 SelectionDAG &DAG = DCI.DAG;
15200 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
15201 ConstantSDNode *ConstantN1 = dyn_cast<ConstantSDNode>(N1);
15203 EVT VT = N->getValueType(0);
15204 const bool FullFP16 = DAG.getSubtarget<AArch64Subtarget>().hasFullFP16();
15205 bool IsStrict = N0->isStrictFPOpcode();
15207 // extract(dup x) -> x
15208 if (N0.getOpcode() == AArch64ISD::DUP)
15209 return DAG.getZExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
15211 // Rewrite for pairwise fadd pattern
15212 // (f32 (extract_vector_elt
15213 // (fadd (vXf32 Other)
15214 // (vector_shuffle (vXf32 Other) undef <1,X,...> )) 0))
15216 // (f32 (fadd (extract_vector_elt (vXf32 Other) 0)
15217 // (extract_vector_elt (vXf32 Other) 1))
15218 // For strict_fadd we need to make sure the old strict_fadd can be deleted, so
15219 // we can only do this when it's used only by the extract_vector_elt.
15220 if (ConstantN1 && ConstantN1->getZExtValue() == 0 &&
15221 hasPairwiseAdd(N0->getOpcode(), VT, FullFP16) &&
15222 (!IsStrict || N0.hasOneUse())) {
15224 SDValue N00 = N0->getOperand(IsStrict ? 1 : 0);
15225 SDValue N01 = N0->getOperand(IsStrict ? 2 : 1);
15227 ShuffleVectorSDNode *Shuffle = dyn_cast<ShuffleVectorSDNode>(N01);
15228 SDValue Other = N00;
15230 // And handle the commutative case.
15232 Shuffle = dyn_cast<ShuffleVectorSDNode>(N00);
15236 if (Shuffle && Shuffle->getMaskElt(0) == 1 &&
15237 Other == Shuffle->getOperand(0)) {
15238 SDValue Extract1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15239 DAG.getConstant(0, DL, MVT::i64));
15240 SDValue Extract2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Other,
15241 DAG.getConstant(1, DL, MVT::i64));
15243 return DAG.getNode(N0->getOpcode(), DL, VT, Extract1, Extract2);
15245 // For strict_fadd we need uses of the final extract_vector to be replaced
15246 // with the strict_fadd, but we also need uses of the chain output of the
15247 // original strict_fadd to use the chain output of the new strict_fadd as
15248 // otherwise it may not be deleted.
15249 SDValue Ret = DAG.getNode(N0->getOpcode(), DL,
15251 {N0->getOperand(0), Extract1, Extract2});
15252 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Ret);
15253 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Ret.getValue(1));
15254 return SDValue(N, 0);
15261 static SDValue performConcatVectorsCombine(SDNode *N,
15262 TargetLowering::DAGCombinerInfo &DCI,
15263 SelectionDAG &DAG) {
15265 EVT VT = N->getValueType(0);
15266 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
15267 unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
15269 if (VT.isScalableVector())
15272 // Optimize concat_vectors of truncated vectors, where the intermediate
15273 // type is illegal, to avoid said illegality, e.g.,
15274 // (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
15275 // (v2i16 (truncate (v2i64)))))
15277 // (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
15278 // (v4i32 (bitcast (v2i64))),
15280 // This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
15281 // on both input and result type, so we might generate worse code.
15282 // On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
15283 if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
15284 N1Opc == ISD::TRUNCATE) {
15285 SDValue N00 = N0->getOperand(0);
15286 SDValue N10 = N1->getOperand(0);
15287 EVT N00VT = N00.getValueType();
15289 if (N00VT == N10.getValueType() &&
15290 (N00VT == MVT::v2i64 || N00VT == MVT::v4i32) &&
15291 N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
15292 MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
15293 SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
15294 for (size_t i = 0; i < Mask.size(); ++i)
15296 return DAG.getNode(ISD::TRUNCATE, dl, VT,
15297 DAG.getVectorShuffle(
15299 DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
15300 DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
15304 if (N->getOperand(0).getValueType() == MVT::v4i8) {
15305 // If we have a concat of v4i8 loads, convert them to a buildvector of f32
15306 // loads to prevent having to go through the v4i8 load legalization that
15307 // needs to extend each element into a larger type.
15308 if (N->getNumOperands() % 2 == 0 && all_of(N->op_values(), [](SDValue V) {
15309 if (V.getValueType() != MVT::v4i8)
15313 LoadSDNode *LD = dyn_cast<LoadSDNode>(V);
15314 return LD && V.hasOneUse() && LD->isSimple() && !LD->isIndexed() &&
15315 LD->getExtensionType() == ISD::NON_EXTLOAD;
15318 EVT::getVectorVT(*DAG.getContext(), MVT::f32, N->getNumOperands());
15319 SmallVector<SDValue> Ops;
15321 for (unsigned i = 0; i < N->getNumOperands(); i++) {
15322 SDValue V = N->getOperand(i);
15324 Ops.push_back(DAG.getUNDEF(MVT::f32));
15326 LoadSDNode *LD = cast<LoadSDNode>(V);
15328 DAG.getLoad(MVT::f32, dl, LD->getChain(), LD->getBasePtr(),
15329 LD->getMemOperand());
15330 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLoad.getValue(1));
15331 Ops.push_back(NewLoad);
15334 return DAG.getBitcast(N->getValueType(0),
15335 DAG.getBuildVector(NVT, dl, Ops));
15340 // Wait 'til after everything is legalized to try this. That way we have
15341 // legal vector types and such.
15342 if (DCI.isBeforeLegalizeOps())
15345 // Optimise concat_vectors of two [us]avgceils or [us]avgfloors that use
15346 // extracted subvectors from the same original vectors. Combine these into a
15347 // single avg that operates on the two original vectors.
15348 // avgceil is the target-independent name for rhadd, avgfloor is a hadd.
15350 // (concat_vectors (v8i8 (avgceils (extract_subvector (v16i8 OpA, <0>),
15351 // extract_subvector (v16i8 OpB, <0>))),
15352 // (v8i8 (avgceils (extract_subvector (v16i8 OpA, <8>),
15353 // extract_subvector (v16i8 OpB, <8>)))))
15355 // (v16i8(avgceils(v16i8 OpA, v16i8 OpB)))
15356 if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
15357 (N0Opc == ISD::AVGCEILU || N0Opc == ISD::AVGCEILS ||
15358 N0Opc == ISD::AVGFLOORU || N0Opc == ISD::AVGFLOORS)) {
15359 SDValue N00 = N0->getOperand(0);
15360 SDValue N01 = N0->getOperand(1);
15361 SDValue N10 = N1->getOperand(0);
15362 SDValue N11 = N1->getOperand(1);
15364 EVT N00VT = N00.getValueType();
15365 EVT N10VT = N10.getValueType();
15367 if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15368 N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15369 N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15370 N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
15371 SDValue N00Source = N00->getOperand(0);
15372 SDValue N01Source = N01->getOperand(0);
15373 SDValue N10Source = N10->getOperand(0);
15374 SDValue N11Source = N11->getOperand(0);
15376 if (N00Source == N10Source && N01Source == N11Source &&
15377 N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
15378 assert(N0.getValueType() == N1.getValueType());
15380 uint64_t N00Index = N00.getConstantOperandVal(1);
15381 uint64_t N01Index = N01.getConstantOperandVal(1);
15382 uint64_t N10Index = N10.getConstantOperandVal(1);
15383 uint64_t N11Index = N11.getConstantOperandVal(1);
15385 if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
15386 N10Index == N00VT.getVectorNumElements())
15387 return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
15392 // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
15393 // splat. The indexed instructions are going to be expecting a DUPLANE64, so
15394 // canonicalise to that.
15395 if (N->getNumOperands() == 2 && N0 == N1 && VT.getVectorNumElements() == 2) {
15396 assert(VT.getScalarSizeInBits() == 64);
15397 return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
15398 DAG.getConstant(0, dl, MVT::i64));
15401 // Canonicalise concat_vectors so that the right-hand vector has as few
15402 // bit-casts as possible before its real operation. The primary matching
15403 // destination for these operations will be the narrowing "2" instructions,
15404 // which depend on the operation being performed on this right-hand vector.
15406 // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
15408 // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
15410 if (N->getNumOperands() != 2 || N1Opc != ISD::BITCAST)
15412 SDValue RHS = N1->getOperand(0);
15413 MVT RHSTy = RHS.getValueType().getSimpleVT();
15414 // If the RHS is not a vector, this is not the pattern we're looking for.
15415 if (!RHSTy.isVector())
15419 dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
15421 MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
15422 RHSTy.getVectorNumElements() * 2);
15423 return DAG.getNode(ISD::BITCAST, dl, VT,
15424 DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
15425 DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
15429 static SDValue
15430 performExtractSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15431 SelectionDAG &DAG) {
15432 if (DCI.isBeforeLegalizeOps())
15435 EVT VT = N->getValueType(0);
15436 if (!VT.isScalableVector() || VT.getVectorElementType() != MVT::i1)
15439 SDValue V = N->getOperand(0);
15441 // NOTE: This combine exists in DAGCombiner, but that version's legality check
15442 // blocks this combine because the non-const case requires custom lowering.
15444 // ty1 extract_vector(ty2 splat(const))) -> ty1 splat(const)
15445 if (V.getOpcode() == ISD::SPLAT_VECTOR)
15446 if (isa<ConstantSDNode>(V.getOperand(0)))
15447 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V.getOperand(0));
15452 static SDValue
15453 performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
15454 SelectionDAG &DAG) {
15456 SDValue Vec = N->getOperand(0);
15457 SDValue SubVec = N->getOperand(1);
15458 uint64_t IdxVal = N->getConstantOperandVal(2);
15459 EVT VecVT = Vec.getValueType();
15460 EVT SubVT = SubVec.getValueType();
15462 // Only do this for legal fixed vector types.
15463 if (!VecVT.isFixedLengthVector() ||
15464 !DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15465 !DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
15468 // Ignore widening patterns.
15469 if (IdxVal == 0 && Vec.isUndef())
15472 // Subvector must be half the width and an "aligned" insertion.
15473 unsigned NumSubElts = SubVT.getVectorNumElements();
15474 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15475 (IdxVal != 0 && IdxVal != NumSubElts))
15478 // Fold insert_subvector -> concat_vectors
15479 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15480 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15484 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15485 DAG.getVectorIdxConstant(NumSubElts, DL));
15487 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15488 DAG.getVectorIdxConstant(0, DL));
15491 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15494 static SDValue tryCombineFixedPointConvert(SDNode *N,
15495 TargetLowering::DAGCombinerInfo &DCI,
15496 SelectionDAG &DAG) {
15497 // Wait until after everything is legalized to try this. That way we have
15498 // legal vector types and such.
15499 if (DCI.isBeforeLegalizeOps())
15501 // Transform a scalar conversion of a value from a lane extract into a
15502 // lane extract of a vector conversion. E.g., from foo1 to foo2:
15503 // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
15504 // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
15506 // The second form interacts better with instruction selection and the
15507 // register allocator to avoid cross-class register copies that aren't
15508 // coalescable due to a lane reference.
15510 // Check the operand and see if it originates from a lane extract.
15511 SDValue Op1 = N->getOperand(1);
15512 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15515 // Yep, no additional predication needed. Perform the transform.
15516 SDValue IID = N->getOperand(0);
15517 SDValue Shift = N->getOperand(2);
15518 SDValue Vec = Op1.getOperand(0);
15519 SDValue Lane = Op1.getOperand(1);
15520 EVT ResTy = N->getValueType(0);
15524 // The vector width should be 128 bits by the time we get here, even
15525 // if it started as 64 bits (the extract_vector handling will have
15526 // done so). Bail if it is not.
15527 if (Vec.getValueSizeInBits() != 128)
15530 if (Vec.getValueType() == MVT::v4i32)
15531 VecResTy = MVT::v4f32;
15532 else if (Vec.getValueType() == MVT::v2i64)
15533 VecResTy = MVT::v2f64;
15538 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
15539 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
15542 // AArch64 high-vector "long" operations are formed by performing the non-high
15543 // version on an extract_subvector of each operand which gets the high half:
15545 // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
15547 // However, there are cases which don't have an extract_high explicitly, but
15548 // have another operation that can be made compatible with one for free. For
15551 // (dupv64 scalar) --> (extract_high (dup128 scalar))
15553 // This routine does the actual conversion of such DUPs, once outer routines
15554 // have determined that everything else is in order.
15555 // It also supports immediate DUP-like nodes (MOVI/MVNi), which we can fold
15557 static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
15558 MVT VT = N.getSimpleValueType();
15559 if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
15560 N.getConstantOperandVal(1) == 0)
15561 N = N.getOperand(0);
15563 switch (N.getOpcode()) {
15564 case AArch64ISD::DUP:
15565 case AArch64ISD::DUPLANE8:
15566 case AArch64ISD::DUPLANE16:
15567 case AArch64ISD::DUPLANE32:
15568 case AArch64ISD::DUPLANE64:
15569 case AArch64ISD::MOVI:
15570 case AArch64ISD::MOVIshift:
15571 case AArch64ISD::MOVIedit:
15572 case AArch64ISD::MOVImsl:
15573 case AArch64ISD::MVNIshift:
15574 case AArch64ISD::MVNImsl:
15577 // FMOV could be supported, but isn't very useful, as it would only occur
15578 // if you passed a bitcast floating-point immediate to an eligible long
15579 // integer op (addl, smull, ...).
15583 if (!VT.is64BitVector())
15587 unsigned NumElems = VT.getVectorNumElements();
15588 if (N.getValueType().is64BitVector()) {
15589 MVT ElementTy = VT.getVectorElementType();
15590 MVT NewVT = MVT::getVectorVT(ElementTy, NumElems * 2);
15591 N = DAG.getNode(N->getOpcode(), DL, NewVT, N->ops());
15594 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N,
15595 DAG.getConstant(NumElems, DL, MVT::i64));
15598 static bool isEssentiallyExtractHighSubvector(SDValue N) {
15599 if (N.getOpcode() == ISD::BITCAST)
15600 N = N.getOperand(0);
15601 if (N.getOpcode() != ISD::EXTRACT_SUBVECTOR)
15603 if (N.getOperand(0).getValueType().isScalableVector())
15605 return cast<ConstantSDNode>(N.getOperand(1))->getAPIntValue() ==
15606 N.getOperand(0).getValueType().getVectorNumElements() / 2;
15609 /// Helper structure to keep track of ISD::SET_CC operands.
15610 struct GenericSetCCInfo {
15611 const SDValue *Opnd0;
15612 const SDValue *Opnd1;
15616 /// Helper structure to keep track of a SET_CC lowered into AArch64 code.
15617 struct AArch64SetCCInfo {
15618 const SDValue *Cmp;
15619 AArch64CC::CondCode CC;
15622 /// Helper structure to keep track of SetCC information.
15624 GenericSetCCInfo Generic;
15625 AArch64SetCCInfo AArch64;
15628 /// Helper structure to be able to read SetCC information. If set to
15629 /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
15630 /// GenericSetCCInfo.
15631 struct SetCCInfoAndKind {
15636 /// Check whether or not \p Op is a SET_CC operation, either a generic or
15638 /// AArch64 lowered one.
15639 /// \p SetCCInfo is filled accordingly.
15640 /// \post SetCCInfo is meaningful only when this function returns true.
15641 /// \return True when Op is a kind of SET_CC operation.
15642 static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
15643 // If this is a setcc, this is straightforward.
15644 if (Op.getOpcode() == ISD::SETCC) {
15645 SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
15646 SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
15647 SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
15648 SetCCInfo.IsAArch64 = false;
15651 // Otherwise, check if this is a matching csel instruction.
15654 // - csel 0, 1, !cc
15655 if (Op.getOpcode() != AArch64ISD::CSEL)
15657 // Set the information about the operands.
15658 // TODO: we want the operands of the Cmp not the csel
15659 SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
15660 SetCCInfo.IsAArch64 = true;
15661 SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
15662 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
15664 // Check that the operands matches the constraints:
15665 // (1) Both operands must be constants.
15666 // (2) One must be 1 and the other must be 0.
15667 ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
15668 ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
15671 if (!TValue || !FValue)
15675 if (!TValue->isOne()) {
15676 // Update the comparison when we are interested in !cc.
15677 std::swap(TValue, FValue);
15678 SetCCInfo.Info.AArch64.CC =
15679 AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
15681 return TValue->isOne() && FValue->isZero();
15684 // Returns true if Op is setcc or zext of setcc.
15685 static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
15686 if (isSetCC(Op, Info))
15688 return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
15689 isSetCC(Op->getOperand(0), Info));
15692 // The folding we want to perform is:
15693 // (add x, [zext] (setcc cc ...) )
15695 // (csel x, (add x, 1), !cc ...)
15697 // The latter will get matched to a CSINC instruction.
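// For example, (add x, (zext (setcc eq a, b))) becomes
// (csel x, (add x, 1), ne), which selection turns into a single csinc after
// the compare.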
15698 static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
15699 assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
15700 SDValue LHS = Op->getOperand(0);
15701 SDValue RHS = Op->getOperand(1);
15702 SetCCInfoAndKind InfoAndKind;
15704 // If both operands are a SET_CC, then we don't want to perform this
15705 // folding and create another csel as this results in more instructions
15706 // (and higher register usage).
15707 if (isSetCCOrZExtSetCC(LHS, InfoAndKind) &&
15708 isSetCCOrZExtSetCC(RHS, InfoAndKind))
15711 // If neither operand is a SET_CC, give up.
15712 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
15713 std::swap(LHS, RHS);
15714 if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
15718 // FIXME: This could be generalized to work for FP comparisons.
15719 EVT CmpVT = InfoAndKind.IsAArch64
15720 ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
15721 : InfoAndKind.Info.Generic.Opnd0->getValueType();
15722 if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
15728 if (InfoAndKind.IsAArch64) {
15729 CCVal = DAG.getConstant(
15730 AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), dl,
15732 Cmp = *InfoAndKind.Info.AArch64.Cmp;
15734 Cmp = getAArch64Cmp(
15735 *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
15736 ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
15739 EVT VT = Op->getValueType(0);
15740 LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
15741 return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
15744 // ADD(UADDV a, UADDV b) --> UADDV(ADD a, b)
15745 static SDValue performAddUADDVCombine(SDNode *N, SelectionDAG &DAG) {
15746 EVT VT = N->getValueType(0);
15747 // Only scalar integer and vector types.
15748 if (N->getOpcode() != ISD::ADD || !VT.isScalarInteger())
15751 SDValue LHS = N->getOperand(0);
15752 SDValue RHS = N->getOperand(1);
15753 if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15754 RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || LHS.getValueType() != VT)
15757 auto *LHSN1 = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
15758 auto *RHSN1 = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
15759 if (!LHSN1 || LHSN1 != RHSN1 || !RHSN1->isZero())
15762 SDValue Op1 = LHS->getOperand(0);
15763 SDValue Op2 = RHS->getOperand(0);
15764 EVT OpVT1 = Op1.getValueType();
15765 EVT OpVT2 = Op2.getValueType();
15766 if (Op1.getOpcode() != AArch64ISD::UADDV || OpVT1 != OpVT2 ||
15767 Op2.getOpcode() != AArch64ISD::UADDV ||
15768 OpVT1.getVectorElementType() != VT)
15771 SDValue Val1 = Op1.getOperand(0);
15772 SDValue Val2 = Op2.getOperand(0);
15773 EVT ValVT = Val1->getValueType(0);
15775 SDValue AddVal = DAG.getNode(ISD::ADD, DL, ValVT, Val1, Val2);
15776 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
15777 DAG.getNode(AArch64ISD::UADDV, DL, ValVT, AddVal),
15778 DAG.getConstant(0, DL, MVT::i64));
15781 /// Perform the scalar expression combine in the form of:
15782 /// CSEL(c, 1, cc) + b => CSINC(b+c, b, cc)
15783 /// CSNEG(c, -1, cc) + b => CSINC(b+c, b, cc)
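/// For example, (add (csel 5, 1, cc), b) becomes (csinc b+5, b, cc): when cc
/// holds the result is b+5, otherwise it is b+1, which csinc provides for
/// free.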
15784 static SDValue performAddCSelIntoCSinc(SDNode *N, SelectionDAG &DAG) {
15785 EVT VT = N->getValueType(0);
15786 if (!VT.isScalarInteger() || N->getOpcode() != ISD::ADD)
15789 SDValue LHS = N->getOperand(0);
15790 SDValue RHS = N->getOperand(1);
15792 // Handle commutativity.
15793 if (LHS.getOpcode() != AArch64ISD::CSEL &&
15794 LHS.getOpcode() != AArch64ISD::CSNEG) {
15795 std::swap(LHS, RHS);
15796 if (LHS.getOpcode() != AArch64ISD::CSEL &&
15797 LHS.getOpcode() != AArch64ISD::CSNEG) {
15802 if (!LHS.hasOneUse())
15805 AArch64CC::CondCode AArch64CC =
15806 static_cast<AArch64CC::CondCode>(LHS.getConstantOperandVal(2));
15808 // The CSEL should include a const one operand, and the CSNEG should include
15809 // One or NegOne operand.
15810 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(LHS.getOperand(0));
15811 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
15812 if (!CTVal || !CFVal)
15815 if (!(LHS.getOpcode() == AArch64ISD::CSEL &&
15816 (CTVal->isOne() || CFVal->isOne())) &&
15817 !(LHS.getOpcode() == AArch64ISD::CSNEG &&
15818 (CTVal->isOne() || CFVal->isAllOnes())))
15821 // Switch CSEL(1, c, cc) to CSEL(c, 1, !cc)
15822 if (LHS.getOpcode() == AArch64ISD::CSEL && CTVal->isOne() &&
15824 std::swap(CTVal, CFVal);
15825 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
15829 // Switch CSNEG(1, c, cc) to CSNEG(-c, -1, !cc)
15830 if (LHS.getOpcode() == AArch64ISD::CSNEG && CTVal->isOne() &&
15831 !CFVal->isAllOnes()) {
15832 APInt C = -1 * CFVal->getAPIntValue();
15833 CTVal = cast<ConstantSDNode>(DAG.getConstant(C, DL, VT));
15834 CFVal = cast<ConstantSDNode>(DAG.getAllOnesConstant(DL, VT));
15835 AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
15838 // It might be neutral for larger constants, as the immediate needs to be
15839 // materialized in a register.
15840 APInt ADDC = CTVal->getAPIntValue();
15841 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15842 if (!TLI.isLegalAddImmediate(ADDC.getSExtValue()))
15845 assert(((LHS.getOpcode() == AArch64ISD::CSEL && CFVal->isOne()) ||
15846 (LHS.getOpcode() == AArch64ISD::CSNEG && CFVal->isAllOnes())) &&
15847 "Unexpected constant value");
15849 SDValue NewNode = DAG.getNode(ISD::ADD, DL, VT, RHS, SDValue(CTVal, 0));
15850 SDValue CCVal = DAG.getConstant(AArch64CC, DL, MVT::i32);
15851 SDValue Cmp = LHS.getOperand(3);
15853 return DAG.getNode(AArch64ISD::CSINC, DL, VT, NewNode, RHS, CCVal, Cmp);
15856 // ADD(UDOT(zero, x, y), A) --> UDOT(A, x, y)
15857 static SDValue performAddDotCombine(SDNode *N, SelectionDAG &DAG) {
15858 EVT VT = N->getValueType(0);
15859 if (N->getOpcode() != ISD::ADD)
15862 SDValue Dot = N->getOperand(0);
15863 SDValue A = N->getOperand(1);
15864 // Handle commutativity
15865 auto isZeroDot = [](SDValue Dot) {
15866 return (Dot.getOpcode() == AArch64ISD::UDOT ||
15867 Dot.getOpcode() == AArch64ISD::SDOT) &&
15868 isZerosVector(Dot.getOperand(0).getNode());
15870 if (!isZeroDot(Dot))
15871 std::swap(Dot, A);
15872 if (!isZeroDot(Dot))
15873 return SDValue();
15875 return DAG.getNode(Dot.getOpcode(), SDLoc(N), VT, A, Dot.getOperand(1),
15876 Dot.getOperand(2));
15879 static bool isNegatedInteger(SDValue Op) {
15880 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0));
15883 static SDValue getNegatedInteger(SDValue Op, SelectionDAG &DAG) {
15885 EVT VT = Op.getValueType();
15886 SDValue Zero = DAG.getConstant(0, DL, VT);
15887 return DAG.getNode(ISD::SUB, DL, VT, Zero, Op);
15892 // (neg (csel X, Y)) -> (csel (neg X), (neg Y))
15894 // The folding helps csel to be matched with csneg without generating
15895 // redundant neg instruction, which includes negation of the csel expansion
15896 // of abs node lowered by lowerABS.
15897 static SDValue performNegCSelCombine(SDNode *N, SelectionDAG &DAG) {
15898 if (!isNegatedInteger(SDValue(N, 0)))
15901 SDValue CSel = N->getOperand(1);
15902 if (CSel.getOpcode() != AArch64ISD::CSEL || !CSel->hasOneUse())
15905 SDValue N0 = CSel.getOperand(0);
15906 SDValue N1 = CSel.getOperand(1);
15908 // If neither of them is a negation, it's not worth the folding as it
15909 // introduces two additional negations while reducing one negation.
15910 if (!isNegatedInteger(N0) && !isNegatedInteger(N1))
15913 SDValue N0N = getNegatedInteger(N0, DAG);
15914 SDValue N1N = getNegatedInteger(N1, DAG);
15917 EVT VT = CSel.getValueType();
15918 return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0N, N1N, CSel.getOperand(2),
15919 CSel.getOperand(3));
15922 // The basic add/sub long vector instructions have variants with "2" on the end
15923 // which act on the high-half of their inputs. They are normally matched by
15926 // (add (zeroext (extract_high LHS)),
15927 // (zeroext (extract_high RHS)))
15928 // -> uaddl2 vD, vN, vM
15930 // However, if one of the extracts is something like a duplicate, this
15931 // instruction can still be used profitably. This function puts the DAG into a
15932 // more appropriate form for those patterns to trigger.
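// For example, a 64-bit (dup w0) operand is rebuilt as the high half of a
// 128-bit dup, so the extract_high-based patterns (uaddl2, usubl2, ...) can
// still match even though no explicit extract_subvector was present.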
15933 static SDValue performAddSubLongCombine(SDNode *N,
15934 TargetLowering::DAGCombinerInfo &DCI,
15935 SelectionDAG &DAG) {
15936 if (DCI.isBeforeLegalizeOps())
15939 MVT VT = N->getSimpleValueType(0);
15940 if (!VT.is128BitVector()) {
15941 if (N->getOpcode() == ISD::ADD)
15942 return performSetccAddFolding(N, DAG);
15946 // Make sure both branches are extended in the same way.
15947 SDValue LHS = N->getOperand(0);
15948 SDValue RHS = N->getOperand(1);
15949 if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
15950 LHS.getOpcode() != ISD::SIGN_EXTEND) ||
15951 LHS.getOpcode() != RHS.getOpcode())
15954 unsigned ExtType = LHS.getOpcode();
15956 // It's only worth doing if at least one of the inputs is already an
15957 // extract, but we don't know which it'll be so we have to try both.
15958 if (isEssentiallyExtractHighSubvector(LHS.getOperand(0))) {
15959 RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
15960 if (!RHS.getNode())
15963 RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
15964 } else if (isEssentiallyExtractHighSubvector(RHS.getOperand(0))) {
15965 LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
15966 if (!LHS.getNode())
15969 LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
15972 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
15975 static bool isCMP(SDValue Op) {
15976 return Op.getOpcode() == AArch64ISD::SUBS &&
15977 !Op.getNode()->hasAnyUseOfValue(0);
15980 // (CSEL 1 0 CC Cond) => CC
15981 // (CSEL 0 1 CC Cond) => !CC
15982 static Optional<AArch64CC::CondCode> getCSETCondCode(SDValue Op) {
15983 if (Op.getOpcode() != AArch64ISD::CSEL)
15985 auto CC = static_cast<AArch64CC::CondCode>(Op.getConstantOperandVal(2));
15986 if (CC == AArch64CC::AL || CC == AArch64CC::NV)
15988 SDValue OpLHS = Op.getOperand(0);
15989 SDValue OpRHS = Op.getOperand(1);
15990 if (isOneConstant(OpLHS) && isNullConstant(OpRHS))
15992 if (isNullConstant(OpLHS) && isOneConstant(OpRHS))
15993 return getInvertedCondCode(CC);
15998 // (ADC{S} l r (CMP (CSET HS carry) 1)) => (ADC{S} l r carry)
15999 // (SBC{S} l r (CMP 0 (CSET LO carry))) => (SBC{S} l r carry)
16000 static SDValue foldOverflowCheck(SDNode *Op, SelectionDAG &DAG, bool IsAdd) {
16001 SDValue CmpOp = Op->getOperand(2);
16006 if (!isOneConstant(CmpOp.getOperand(1)))
16009 if (!isNullConstant(CmpOp.getOperand(0)))
16013 SDValue CsetOp = CmpOp->getOperand(IsAdd ? 0 : 1);
16014 auto CC = getCSETCondCode(CsetOp);
16015 if (CC != (IsAdd ? AArch64CC::HS : AArch64CC::LO))
16018 return DAG.getNode(Op->getOpcode(), SDLoc(Op), Op->getVTList(),
16019 Op->getOperand(0), Op->getOperand(1),
16020 CsetOp.getOperand(3));
16023 // (ADC x 0 cond) => (CINC x HS cond)
16024 static SDValue foldADCToCINC(SDNode *N, SelectionDAG &DAG) {
16025 SDValue LHS = N->getOperand(0);
16026 SDValue RHS = N->getOperand(1);
16027 SDValue Cond = N->getOperand(2);
16029 if (!isNullConstant(RHS))
16032 EVT VT = N->getValueType(0);
16035 // (CINC x cc cond) <=> (CSINC x x !cc cond)
16036 SDValue CC = DAG.getConstant(AArch64CC::LO, DL, MVT::i32);
16037 return DAG.getNode(AArch64ISD::CSINC, DL, VT, LHS, LHS, CC, Cond);
16040 // Transform vector add(zext i8 to i32, zext i8 to i32)
16041 // into sext(add(zext(i8 to i16), zext(i8 to i16)) to i32)
16042 // This allows extra uses of saddl/uaddl at the lower vector widths, and less cost overall.
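// For example, (v16i32 add (zext v16i8 a), (zext v16i8 b)) becomes
// (v16i32 sext (v16i16 add (zext a), (zext b))), so the inner add can be
// selected as uaddl/uaddl2 on the narrower element type.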
16044 static SDValue performVectorAddSubExtCombine(SDNode *N, SelectionDAG &DAG) {
16045 EVT VT = N->getValueType(0);
16046 if (!VT.isFixedLengthVector() || VT.getSizeInBits() <= 128 ||
16047 (N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
16048 N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND) ||
16049 (N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
16050 N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND) ||
16051 N->getOperand(0).getOperand(0).getValueType() !=
16052 N->getOperand(1).getOperand(0).getValueType())
16055 SDValue N0 = N->getOperand(0).getOperand(0);
16056 SDValue N1 = N->getOperand(1).getOperand(0);
16057 EVT InVT = N0.getValueType();
16059 EVT S1 = InVT.getScalarType();
16060 EVT S2 = VT.getScalarType();
16061 if ((S2 == MVT::i32 && S1 == MVT::i8) ||
16062 (S2 == MVT::i64 && (S1 == MVT::i8 || S1 == MVT::i16))) {
16064 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(),
16065 S2.getHalfSizedIntegerVT(*DAG.getContext()),
16066 VT.getVectorElementCount());
16067 SDValue NewN0 = DAG.getNode(N->getOperand(0).getOpcode(), DL, HalfVT, N0);
16068 SDValue NewN1 = DAG.getNode(N->getOperand(1).getOpcode(), DL, HalfVT, N1);
16069 SDValue NewOp = DAG.getNode(N->getOpcode(), DL, HalfVT, NewN0, NewN1);
16070 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewOp);
16075 static SDValue performBuildVectorCombine(SDNode *N,
16076 TargetLowering::DAGCombinerInfo &DCI,
16077 SelectionDAG &DAG) {
16080 // A build vector of two extracted elements is equivalent to an
16081 // extract subvector where the inner vector is any-extended to the
16082 // extract_vector_elt VT.
16083 // (build_vector (extract_elt_iXX_to_i32 vec Idx+0)
16084 // (extract_elt_iXX_to_i32 vec Idx+1))
16085 // => (extract_subvector (anyext_iXX_to_i32 vec) Idx)
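// For example, (v2i32 build_vector (extract_elt (v4i16 v), 0),
//                                   (extract_elt (v4i16 v), 1))
// can become (extract_subvector (v4i32 any_extend v), 0).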
16087 // For now, only consider the v2i32 case, which arises as a result of legalization.
16089 if (N->getValueType(0) != MVT::v2i32)
16092 SDValue Elt0 = N->getOperand(0), Elt1 = N->getOperand(1);
16093 // Reminder, EXTRACT_VECTOR_ELT has the effect of any-extending to its VT.
16094 if (Elt0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16095 Elt1->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
16097 isa<ConstantSDNode>(Elt0->getOperand(1)) &&
16098 isa<ConstantSDNode>(Elt1->getOperand(1)) &&
16099 // Both EXTRACT_VECTOR_ELT from same vector...
16100 Elt0->getOperand(0) == Elt1->getOperand(0) &&
16101 // ... and contiguous. First element's index +1 == second element's index.
16102 Elt0->getConstantOperandVal(1) + 1 == Elt1->getConstantOperandVal(1)) {
16103 SDValue VecToExtend = Elt0->getOperand(0);
16104 EVT ExtVT = VecToExtend.getValueType().changeVectorElementType(MVT::i32);
16105 if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtVT))
16108 SDValue SubvectorIdx = DAG.getVectorIdxConstant(Elt0->getConstantOperandVal(1), DL);
16110 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, DL, ExtVT, VecToExtend);
16111 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i32, Ext,
16118 static SDValue performAddSubCombine(SDNode *N,
16119 TargetLowering::DAGCombinerInfo &DCI,
16120 SelectionDAG &DAG) {
16121 // Try to change sum of two reductions.
16122 if (SDValue Val = performAddUADDVCombine(N, DAG))
16124 if (SDValue Val = performAddDotCombine(N, DAG))
16126 if (SDValue Val = performAddCSelIntoCSinc(N, DAG))
16128 if (SDValue Val = performNegCSelCombine(N, DAG))
16130 if (SDValue Val = performVectorAddSubExtCombine(N, DAG))
16133 return performAddSubLongCombine(N, DCI, DAG);
16136 // Massage DAGs which we can use the high-half "long" operations on into
16137 // something isel will recognize better. E.g.
16139 // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
16140 // (aarch64_neon_umull (extract_high (v2i64 vec)))
16141 // (extract_high (v2i64 (dup128 scalar)))))
16143 static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
16144 TargetLowering::DAGCombinerInfo &DCI,
16145 SelectionDAG &DAG) {
16146 if (DCI.isBeforeLegalizeOps())
16149 SDValue LHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 0 : 1);
16150 SDValue RHS = N->getOperand((IID == Intrinsic::not_intrinsic) ? 1 : 2);
16151 assert(LHS.getValueType().is64BitVector() &&
16152 RHS.getValueType().is64BitVector() &&
16153 "unexpected shape for long operation");
16155 // Either node could be a DUP, but it's not worth doing both of them (you'd
16156 // just as well use the non-high version) so look for a corresponding extract
16157 // operation on the other "wing".
16158 if (isEssentiallyExtractHighSubvector(LHS)) {
16159 RHS = tryExtendDUPToExtractHigh(RHS, DAG);
16160 if (!RHS.getNode())
16162 } else if (isEssentiallyExtractHighSubvector(RHS)) {
16163 LHS = tryExtendDUPToExtractHigh(LHS, DAG);
16164 if (!LHS.getNode())
16168 if (IID == Intrinsic::not_intrinsic)
16169 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), LHS, RHS);
16171 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
16172 N->getOperand(0), LHS, RHS);
16175 static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
16176 MVT ElemTy = N->getSimpleValueType(0).getScalarType();
16177 unsigned ElemBits = ElemTy.getSizeInBits();
16179 int64_t ShiftAmount;
16180 if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
16181 APInt SplatValue, SplatUndef;
16182 unsigned SplatBitSize;
16184 if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
16185 HasAnyUndefs, ElemBits) ||
16186 SplatBitSize != ElemBits)
16189 ShiftAmount = SplatValue.getSExtValue();
16190 } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
16191 ShiftAmount = CVN->getSExtValue();
16199 llvm_unreachable("Unknown shift intrinsic");
16200 case Intrinsic::aarch64_neon_sqshl:
16201 Opcode = AArch64ISD::SQSHL_I;
16202 IsRightShift = false;
16204 case Intrinsic::aarch64_neon_uqshl:
16205 Opcode = AArch64ISD::UQSHL_I;
16206 IsRightShift = false;
16208 case Intrinsic::aarch64_neon_srshl:
16209 Opcode = AArch64ISD::SRSHR_I;
16210 IsRightShift = true;
16212 case Intrinsic::aarch64_neon_urshl:
16213 Opcode = AArch64ISD::URSHR_I;
16214 IsRightShift = true;
16216 case Intrinsic::aarch64_neon_sqshlu:
16217 Opcode = AArch64ISD::SQSHLU_I;
16218 IsRightShift = false;
16220 case Intrinsic::aarch64_neon_sshl:
16221 case Intrinsic::aarch64_neon_ushl:
16222 // For positive shift amounts we can use SHL, as ushl/sshl perform a regular
16223 // left shift for positive shift amounts. Below, we only replace the current
16224 // node with VSHL, if this condition is met.
16225 Opcode = AArch64ISD::VSHL;
16226 IsRightShift = false;
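// NOTE: For the right-shift cases above the intrinsic encodes the constant
// shift as a negative left-shift amount, e.g. a splat of -3 on urshl becomes
// URSHR_I with an immediate of 3 below.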
16230 if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
16232 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
16233 DAG.getConstant(-ShiftAmount, dl, MVT::i32));
16234 } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
16236 return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
16237 DAG.getConstant(ShiftAmount, dl, MVT::i32));
16243 // The CRC32[BH] instructions ignore the high bits of their data operand. Since
16244 // the intrinsics must be legal and take an i32, this means there's almost
16245 // certainly going to be a zext in the DAG which we can eliminate.
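// For example, (crc32b x, (and y, 0xff)) can use y directly, because the
// instruction only reads the low 8 bits of the data operand anyway.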
16246 static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
16247 SDValue AndN = N->getOperand(2);
16248 if (AndN.getOpcode() != ISD::AND)
16251 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
16252 if (!CMask || CMask->getZExtValue() != Mask)
16255 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
16256 N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
16259 static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
16260 SelectionDAG &DAG) {
16262 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0),
16263 DAG.getNode(Opc, dl,
16264 N->getOperand(1).getSimpleValueType(),
16266 DAG.getConstant(0, dl, MVT::i64));
16269 static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
16271 SDValue Op1 = N->getOperand(1);
16272 SDValue Op2 = N->getOperand(2);
16273 EVT ScalarTy = Op2.getValueType();
16274 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
16275 ScalarTy = MVT::i32;
16277 // Lower index_vector(base, step) to mul(step, step_vector(1)) + splat(base).
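// For example, index(10, 3) for nxv4i32 produces <10, 13, 16, 19, ...>:
// step_vector gives <0, 1, 2, 3, ...>, which is multiplied by splat(3) and
// then added to splat(10).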
16278 SDValue StepVector = DAG.getStepVector(DL, N->getValueType(0));
16279 SDValue Step = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op2);
16280 SDValue Mul = DAG.getNode(ISD::MUL, DL, N->getValueType(0), StepVector, Step);
16281 SDValue Base = DAG.getNode(ISD::SPLAT_VECTOR, DL, N->getValueType(0), Op1);
16282 return DAG.getNode(ISD::ADD, DL, N->getValueType(0), Mul, Base);
16285 static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
16287 SDValue Scalar = N->getOperand(3);
16288 EVT ScalarTy = Scalar.getValueType();
16290 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
16291 Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
16293 SDValue Passthru = N->getOperand(1);
16294 SDValue Pred = N->getOperand(2);
16295 return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
16296 Pred, Scalar, Passthru);
16299 static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
16301 LLVMContext &Ctx = *DAG.getContext();
16302 EVT VT = N->getValueType(0);
16304 assert(VT.isScalableVector() && "Expected a scalable vector.");
16306 // Current lowering only supports the SVE-ACLE types.
16307 if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
16310 unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
16311 unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
16313 EVT::getVectorVT(Ctx, MVT::i8, ElementCount::getScalable(ByteSize));
16315 // Convert everything to the domain of EXT (i.e. bytes).
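// For example, with nxv4i32 operands ElemSize is 4, so an element index of 2
// in the intrinsic becomes a byte offset of 8 for the EXT node.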
16316 SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
16317 SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
16318 SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
16319 DAG.getConstant(ElemSize, dl, MVT::i32));
16321 SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
16322 return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
16325 static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
16326 TargetLowering::DAGCombinerInfo &DCI,
16327 SelectionDAG &DAG) {
16328 if (DCI.isBeforeLegalize())
16331 SDValue Comparator = N->getOperand(3);
16332 if (Comparator.getOpcode() == AArch64ISD::DUP ||
16333 Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
16334 unsigned IID = getIntrinsicID(N);
16335 EVT VT = N->getValueType(0);
16336 EVT CmpVT = N->getOperand(2).getValueType();
16337 SDValue Pred = N->getOperand(1);
16343 llvm_unreachable("Called with wrong intrinsic!");
16346 // Signed comparisons
16347 case Intrinsic::aarch64_sve_cmpeq_wide:
16348 case Intrinsic::aarch64_sve_cmpne_wide:
16349 case Intrinsic::aarch64_sve_cmpge_wide:
16350 case Intrinsic::aarch64_sve_cmpgt_wide:
16351 case Intrinsic::aarch64_sve_cmplt_wide:
16352 case Intrinsic::aarch64_sve_cmple_wide: {
16353 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
16354 int64_t ImmVal = CN->getSExtValue();
16355 if (ImmVal >= -16 && ImmVal <= 15)
16356 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
16362 // Unsigned comparisons
16363 case Intrinsic::aarch64_sve_cmphs_wide:
16364 case Intrinsic::aarch64_sve_cmphi_wide:
16365 case Intrinsic::aarch64_sve_cmplo_wide:
16366 case Intrinsic::aarch64_sve_cmpls_wide: {
16367 if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
16368 uint64_t ImmVal = CN->getZExtValue();
16370 Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
16381 SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
16382 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
16383 N->getOperand(2), Splat, DAG.getCondCode(CC));
16389 static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
16390 AArch64CC::CondCode Cond) {
16391 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16394 assert(Op.getValueType().isScalableVector() &&
16395 TLI.isTypeLegal(Op.getValueType()) &&
16396 "Expected legal scalable vector type!");
16397 assert(Op.getValueType() == Pg.getValueType() &&
16398 "Expected same type for PTEST operands");
16400 // Ensure target specific opcodes are using legal type.
16401 EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
16402 SDValue TVal = DAG.getConstant(1, DL, OutVT);
16403 SDValue FVal = DAG.getConstant(0, DL, OutVT);
16405 // Ensure operands have type nxv16i1.
16406 if (Op.getValueType() != MVT::nxv16i1) {
16407 if ((Cond == AArch64CC::ANY_ACTIVE || Cond == AArch64CC::NONE_ACTIVE) &&
16408 isZeroingInactiveLanes(Op))
16409 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Pg);
16411 Pg = getSVEPredicateBitCast(MVT::nxv16i1, Pg, DAG);
16412 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv16i1, Op);
16415 // Set condition code (CC) flags.
16416 SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
16418 // Convert CC to integer based on requested condition.
16419 // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
16420 SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
16421 SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
16422 return DAG.getZExtOrTrunc(Res, DL, VT);
16425 static SDValue combineSVEReductionInt(SDNode *N, unsigned Opc,
16426 SelectionDAG &DAG) {
16429 SDValue Pred = N->getOperand(1);
16430 SDValue VecToReduce = N->getOperand(2);
16432 // NOTE: The integer reduction's result type is not always linked to the
16433 // operand's element type so we construct it from the intrinsic's result type.
16434 EVT ReduceVT = getPackedSVEVectorVT(N->getValueType(0));
16435 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
16437 // SVE reductions set the whole vector register with the first element
16438 // containing the reduction result, which we'll now extract.
16439 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16440 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
16444 static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
16445 SelectionDAG &DAG) {
16448 SDValue Pred = N->getOperand(1);
16449 SDValue VecToReduce = N->getOperand(2);
16451 EVT ReduceVT = VecToReduce.getValueType();
16452 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
16454 // SVE reductions set the whole vector register with the first element
16455 // containing the reduction result, which we'll now extract.
16456 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16457 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
16461 static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
16462 SelectionDAG &DAG) {
16465 SDValue Pred = N->getOperand(1);
16466 SDValue InitVal = N->getOperand(2);
16467 SDValue VecToReduce = N->getOperand(3);
16468 EVT ReduceVT = VecToReduce.getValueType();
16470 // Ordered reductions use the first lane of the result vector as the
16471 // reduction's initial value.
16472 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
16473 InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
16474 DAG.getUNDEF(ReduceVT), InitVal, Zero);
16476 SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
16478 // SVE reductions set the whole vector register with the first element
16479 // containing the reduction result, which we'll now extract.
16480 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
16484 static bool isAllInactivePredicate(SDValue N) {
16485 // Look through cast.
16486 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST)
16487 N = N.getOperand(0);
16489 return ISD::isConstantSplatVectorAllZeros(N.getNode());
16492 static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N) {
16493 unsigned NumElts = N.getValueType().getVectorMinNumElements();
16495 // Look through cast.
16496 while (N.getOpcode() == AArch64ISD::REINTERPRET_CAST) {
16497 N = N.getOperand(0);
16498 // When reinterpreting from a type with fewer elements the "new" elements
16499 // are not active, so bail if they're likely to be used.
16500 if (N.getValueType().getVectorMinNumElements() < NumElts)
16504 if (ISD::isConstantSplatVectorAllOnes(N.getNode()))
16507 // "ptrue p.<ty>, all" can be considered all active when <ty> is the same size
16508 // or smaller than the implicit element type represented by N.
16509 // NOTE: A larger element count implies a smaller element type.
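// For example, "ptrue p.h, all" (nxv8i1) is still all active when only an
// nxv4i1 predicate over 32-bit elements is required, since every 32-bit lane
// is governed by an active bit.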
16510 if (N.getOpcode() == AArch64ISD::PTRUE &&
16511 N.getConstantOperandVal(0) == AArch64SVEPredPattern::all)
16512 return N.getValueType().getVectorMinNumElements() >= NumElts;
16514 // If we're compiling for a specific vector-length, we can check if the
16515 // pattern's VL equals that of the scalable vector at runtime.
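// For example, with a fixed 256-bit SVE vector length (vscale == 2), a ptrue
// of nxv4i1 with pattern VL8 covers all 8 lanes that exist at runtime and is
// therefore all active.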
16516 if (N.getOpcode() == AArch64ISD::PTRUE) {
16517 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
16518 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
16519 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
16520 if (MaxSVESize && MinSVESize == MaxSVESize) {
16521 unsigned VScale = MaxSVESize / AArch64::SVEBitsPerBlock;
16522 unsigned PatNumElts =
16523 getNumElementsFromSVEPredPattern(N.getConstantOperandVal(0));
16524 return PatNumElts == (NumElts * VScale);
16531 // If a merged operation has no inactive lanes we can relax it to a predicated
16532 // or unpredicated operation, which potentially allows better isel (perhaps
16533 // using immediate forms) or relaxing register reuse requirements.
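// For example, an aarch64_sve_add whose governing predicate is all active can
// be emitted as a plain ISD::ADD, which in turn exposes the unpredicated and
// immediate forms to isel.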
16534 static SDValue convertMergedOpToPredOp(SDNode *N, unsigned Opc,
16535 SelectionDAG &DAG, bool UnpredOp = false,
16536 bool SwapOperands = false) {
16537 assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN && "Expected intrinsic!");
16538 assert(N->getNumOperands() == 4 && "Expected 3 operand intrinsic!");
16539 SDValue Pg = N->getOperand(1);
16540 SDValue Op1 = N->getOperand(SwapOperands ? 3 : 2);
16541 SDValue Op2 = N->getOperand(SwapOperands ? 2 : 3);
16543 // ISD way to specify an all active predicate.
16544 if (isAllActivePredicate(DAG, Pg)) {
16546 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Op1, Op2);
16548 return DAG.getNode(Opc, SDLoc(N), N->getValueType(0), Pg, Op1, Op2);
16551 // FUTURE: SplatVector(true)
16555 static SDValue performIntrinsicCombine(SDNode *N,
16556 TargetLowering::DAGCombinerInfo &DCI,
16557 const AArch64Subtarget *Subtarget) {
16558 SelectionDAG &DAG = DCI.DAG;
16559 unsigned IID = getIntrinsicID(N);
16563 case Intrinsic::get_active_lane_mask: {
16564 SDValue Res = SDValue();
16565 EVT VT = N->getValueType(0);
16566 if (VT.isFixedLengthVector()) {
16567 // We can use the SVE whilelo instruction to lower this intrinsic by
16568 // creating the appropriate sequence of scalable vector operations and
16569 // then extracting a fixed-width subvector from the scalable vector.
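// For example, for a v16i1 result: whilelo produces nxv16i1, which is
// sign-extended to nxv16i8, a fixed v16i8 subvector is extracted, and the
// result is truncated back to v16i1.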
16573 DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, DL, MVT::i64);
16575 EVT WhileVT = EVT::getVectorVT(
16576 *DAG.getContext(), MVT::i1,
16577 ElementCount::getScalable(VT.getVectorNumElements()));
16579 // Get promoted scalable vector VT, i.e. promote nxv4i1 -> nxv4i32.
16580 EVT PromVT = getPromotedVTForPredicate(WhileVT);
16582 // Get the fixed-width equivalent of PromVT for extraction.
16584 EVT::getVectorVT(*DAG.getContext(), PromVT.getVectorElementType(),
16585 VT.getVectorElementCount());
16587 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WhileVT, ID,
16588 N->getOperand(1), N->getOperand(2));
16589 Res = DAG.getNode(ISD::SIGN_EXTEND, DL, PromVT, Res);
16590 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, Res,
16591 DAG.getConstant(0, DL, MVT::i64));
16592 Res = DAG.getNode(ISD::TRUNCATE, DL, VT, Res);
16596 case Intrinsic::aarch64_neon_vcvtfxs2fp:
16597 case Intrinsic::aarch64_neon_vcvtfxu2fp:
16598 return tryCombineFixedPointConvert(N, DCI, DAG);
16599 case Intrinsic::aarch64_neon_saddv:
16600 return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
16601 case Intrinsic::aarch64_neon_uaddv:
16602 return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
16603 case Intrinsic::aarch64_neon_sminv:
16604 return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
16605 case Intrinsic::aarch64_neon_uminv:
16606 return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
16607 case Intrinsic::aarch64_neon_smaxv:
16608 return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
16609 case Intrinsic::aarch64_neon_umaxv:
16610 return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);
16611 case Intrinsic::aarch64_neon_fmax:
16612 return DAG.getNode(ISD::FMAXIMUM, SDLoc(N), N->getValueType(0),
16613 N->getOperand(1), N->getOperand(2));
16614 case Intrinsic::aarch64_neon_fmin:
16615 return DAG.getNode(ISD::FMINIMUM, SDLoc(N), N->getValueType(0),
16616 N->getOperand(1), N->getOperand(2));
16617 case Intrinsic::aarch64_neon_fmaxnm:
16618 return DAG.getNode(ISD::FMAXNUM, SDLoc(N), N->getValueType(0),
16619 N->getOperand(1), N->getOperand(2));
16620 case Intrinsic::aarch64_neon_fminnm:
16621 return DAG.getNode(ISD::FMINNUM, SDLoc(N), N->getValueType(0),
16622 N->getOperand(1), N->getOperand(2));
16623 case Intrinsic::aarch64_neon_smull:
16624 return DAG.getNode(AArch64ISD::SMULL, SDLoc(N), N->getValueType(0),
16625 N->getOperand(1), N->getOperand(2));
16626 case Intrinsic::aarch64_neon_umull:
16627 return DAG.getNode(AArch64ISD::UMULL, SDLoc(N), N->getValueType(0),
16628 N->getOperand(1), N->getOperand(2));
16629 case Intrinsic::aarch64_neon_pmull:
16630 case Intrinsic::aarch64_neon_sqdmull:
16631 return tryCombineLongOpWithDup(IID, N, DCI, DAG);
16632 case Intrinsic::aarch64_neon_sqshl:
16633 case Intrinsic::aarch64_neon_uqshl:
16634 case Intrinsic::aarch64_neon_sqshlu:
16635 case Intrinsic::aarch64_neon_srshl:
16636 case Intrinsic::aarch64_neon_urshl:
16637 case Intrinsic::aarch64_neon_sshl:
16638 case Intrinsic::aarch64_neon_ushl:
16639 return tryCombineShiftImm(IID, N, DAG);
16640 case Intrinsic::aarch64_crc32b:
16641 case Intrinsic::aarch64_crc32cb:
16642 return tryCombineCRC32(0xff, N, DAG);
16643 case Intrinsic::aarch64_crc32h:
16644 case Intrinsic::aarch64_crc32ch:
16645 return tryCombineCRC32(0xffff, N, DAG);
16646 case Intrinsic::aarch64_sve_saddv:
16647 // There is no i64 version of SADDV because the sign is irrelevant.
16648 if (N->getOperand(2)->getValueType(0).getVectorElementType() == MVT::i64)
16649 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
16651 return combineSVEReductionInt(N, AArch64ISD::SADDV_PRED, DAG);
16652 case Intrinsic::aarch64_sve_uaddv:
16653 return combineSVEReductionInt(N, AArch64ISD::UADDV_PRED, DAG);
16654 case Intrinsic::aarch64_sve_smaxv:
16655 return combineSVEReductionInt(N, AArch64ISD::SMAXV_PRED, DAG);
16656 case Intrinsic::aarch64_sve_umaxv:
16657 return combineSVEReductionInt(N, AArch64ISD::UMAXV_PRED, DAG);
16658 case Intrinsic::aarch64_sve_sminv:
16659 return combineSVEReductionInt(N, AArch64ISD::SMINV_PRED, DAG);
16660 case Intrinsic::aarch64_sve_uminv:
16661 return combineSVEReductionInt(N, AArch64ISD::UMINV_PRED, DAG);
16662 case Intrinsic::aarch64_sve_orv:
16663 return combineSVEReductionInt(N, AArch64ISD::ORV_PRED, DAG);
16664 case Intrinsic::aarch64_sve_eorv:
16665 return combineSVEReductionInt(N, AArch64ISD::EORV_PRED, DAG);
16666 case Intrinsic::aarch64_sve_andv:
16667 return combineSVEReductionInt(N, AArch64ISD::ANDV_PRED, DAG);
16668 case Intrinsic::aarch64_sve_index:
16669 return LowerSVEIntrinsicIndex(N, DAG);
16670 case Intrinsic::aarch64_sve_dup:
16671 return LowerSVEIntrinsicDUP(N, DAG);
16672 case Intrinsic::aarch64_sve_dup_x:
16673 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
16675 case Intrinsic::aarch64_sve_ext:
16676 return LowerSVEIntrinsicEXT(N, DAG);
16677 case Intrinsic::aarch64_sve_mul:
16678 return convertMergedOpToPredOp(N, AArch64ISD::MUL_PRED, DAG);
16679 case Intrinsic::aarch64_sve_smulh:
16680 return convertMergedOpToPredOp(N, AArch64ISD::MULHS_PRED, DAG);
16681 case Intrinsic::aarch64_sve_umulh:
16682 return convertMergedOpToPredOp(N, AArch64ISD::MULHU_PRED, DAG);
16683 case Intrinsic::aarch64_sve_smin:
16684 return convertMergedOpToPredOp(N, AArch64ISD::SMIN_PRED, DAG);
16685 case Intrinsic::aarch64_sve_umin:
16686 return convertMergedOpToPredOp(N, AArch64ISD::UMIN_PRED, DAG);
16687 case Intrinsic::aarch64_sve_smax:
16688 return convertMergedOpToPredOp(N, AArch64ISD::SMAX_PRED, DAG);
16689 case Intrinsic::aarch64_sve_umax:
16690 return convertMergedOpToPredOp(N, AArch64ISD::UMAX_PRED, DAG);
16691 case Intrinsic::aarch64_sve_lsl:
16692 return convertMergedOpToPredOp(N, AArch64ISD::SHL_PRED, DAG);
16693 case Intrinsic::aarch64_sve_lsr:
16694 return convertMergedOpToPredOp(N, AArch64ISD::SRL_PRED, DAG);
16695 case Intrinsic::aarch64_sve_asr:
16696 return convertMergedOpToPredOp(N, AArch64ISD::SRA_PRED, DAG);
16697 case Intrinsic::aarch64_sve_fadd:
16698 return convertMergedOpToPredOp(N, AArch64ISD::FADD_PRED, DAG);
16699 case Intrinsic::aarch64_sve_fsub:
16700 return convertMergedOpToPredOp(N, AArch64ISD::FSUB_PRED, DAG);
16701 case Intrinsic::aarch64_sve_fmul:
16702 return convertMergedOpToPredOp(N, AArch64ISD::FMUL_PRED, DAG);
16703 case Intrinsic::aarch64_sve_add:
16704 return convertMergedOpToPredOp(N, ISD::ADD, DAG, true);
16705 case Intrinsic::aarch64_sve_sub:
16706 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true);
16707 case Intrinsic::aarch64_sve_subr:
16708 return convertMergedOpToPredOp(N, ISD::SUB, DAG, true, true);
16709 case Intrinsic::aarch64_sve_and:
16710 return convertMergedOpToPredOp(N, ISD::AND, DAG, true);
16711 case Intrinsic::aarch64_sve_bic:
16712 return convertMergedOpToPredOp(N, AArch64ISD::BIC, DAG, true);
16713 case Intrinsic::aarch64_sve_eor:
16714 return convertMergedOpToPredOp(N, ISD::XOR, DAG, true);
16715 case Intrinsic::aarch64_sve_orr:
16716 return convertMergedOpToPredOp(N, ISD::OR, DAG, true);
16717 case Intrinsic::aarch64_sve_sabd:
16718 return convertMergedOpToPredOp(N, ISD::ABDS, DAG, true);
16719 case Intrinsic::aarch64_sve_uabd:
16720 return convertMergedOpToPredOp(N, ISD::ABDU, DAG, true);
16721 case Intrinsic::aarch64_sve_sqadd:
16722 return convertMergedOpToPredOp(N, ISD::SADDSAT, DAG, true);
16723 case Intrinsic::aarch64_sve_sqsub:
16724 return convertMergedOpToPredOp(N, ISD::SSUBSAT, DAG, true);
16725 case Intrinsic::aarch64_sve_uqadd:
16726 return convertMergedOpToPredOp(N, ISD::UADDSAT, DAG, true);
16727 case Intrinsic::aarch64_sve_uqsub:
16728 return convertMergedOpToPredOp(N, ISD::USUBSAT, DAG, true);
16729 case Intrinsic::aarch64_sve_sqadd_x:
16730 return DAG.getNode(ISD::SADDSAT, SDLoc(N), N->getValueType(0),
16731 N->getOperand(1), N->getOperand(2));
16732 case Intrinsic::aarch64_sve_sqsub_x:
16733 return DAG.getNode(ISD::SSUBSAT, SDLoc(N), N->getValueType(0),
16734 N->getOperand(1), N->getOperand(2));
16735 case Intrinsic::aarch64_sve_uqadd_x:
16736 return DAG.getNode(ISD::UADDSAT, SDLoc(N), N->getValueType(0),
16737 N->getOperand(1), N->getOperand(2));
16738 case Intrinsic::aarch64_sve_uqsub_x:
16739 return DAG.getNode(ISD::USUBSAT, SDLoc(N), N->getValueType(0),
16740 N->getOperand(1), N->getOperand(2));
16741 case Intrinsic::aarch64_sve_asrd:
16742 return DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, SDLoc(N), N->getValueType(0),
16743 N->getOperand(1), N->getOperand(2), N->getOperand(3));
16744 case Intrinsic::aarch64_sve_cmphs:
16745 if (!N->getOperand(2).getValueType().isFloatingPoint())
16746 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16747 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16748 N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
16750 case Intrinsic::aarch64_sve_cmphi:
16751 if (!N->getOperand(2).getValueType().isFloatingPoint())
16752 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16753 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16754 N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
16756 case Intrinsic::aarch64_sve_fcmpge:
16757 case Intrinsic::aarch64_sve_cmpge:
16758 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16759 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16760 N->getOperand(3), DAG.getCondCode(ISD::SETGE));
16762 case Intrinsic::aarch64_sve_fcmpgt:
16763 case Intrinsic::aarch64_sve_cmpgt:
16764 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16765 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16766 N->getOperand(3), DAG.getCondCode(ISD::SETGT));
16768 case Intrinsic::aarch64_sve_fcmpeq:
16769 case Intrinsic::aarch64_sve_cmpeq:
16770 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16771 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16772 N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
16774 case Intrinsic::aarch64_sve_fcmpne:
16775 case Intrinsic::aarch64_sve_cmpne:
16776 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16777 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16778 N->getOperand(3), DAG.getCondCode(ISD::SETNE));
16780 case Intrinsic::aarch64_sve_fcmpuo:
16781 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
16782 N->getValueType(0), N->getOperand(1), N->getOperand(2),
16783 N->getOperand(3), DAG.getCondCode(ISD::SETUO));
16785 case Intrinsic::aarch64_sve_fadda:
16786 return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
16787 case Intrinsic::aarch64_sve_faddv:
16788 return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
16789 case Intrinsic::aarch64_sve_fmaxnmv:
16790 return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
16791 case Intrinsic::aarch64_sve_fmaxv:
16792 return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
16793 case Intrinsic::aarch64_sve_fminnmv:
16794 return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
16795 case Intrinsic::aarch64_sve_fminv:
16796 return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
16797 case Intrinsic::aarch64_sve_sel:
16798 return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
16799 N->getOperand(1), N->getOperand(2), N->getOperand(3));
16800 case Intrinsic::aarch64_sve_cmpeq_wide:
16801 return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
16802 case Intrinsic::aarch64_sve_cmpne_wide:
16803 return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
16804 case Intrinsic::aarch64_sve_cmpge_wide:
16805 return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
16806 case Intrinsic::aarch64_sve_cmpgt_wide:
16807 return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
16808 case Intrinsic::aarch64_sve_cmplt_wide:
16809 return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
16810 case Intrinsic::aarch64_sve_cmple_wide:
16811 return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
16812 case Intrinsic::aarch64_sve_cmphs_wide:
16813 return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
16814 case Intrinsic::aarch64_sve_cmphi_wide:
16815 return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
16816 case Intrinsic::aarch64_sve_cmplo_wide:
16817 return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
16818 case Intrinsic::aarch64_sve_cmpls_wide:
16819 return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
16820 case Intrinsic::aarch64_sve_ptest_any:
16821 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
16822 AArch64CC::ANY_ACTIVE);
16823 case Intrinsic::aarch64_sve_ptest_first:
16824 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
16825 AArch64CC::FIRST_ACTIVE);
16826 case Intrinsic::aarch64_sve_ptest_last:
16827 return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
16828 AArch64CC::LAST_ACTIVE);
16833 static bool isCheapToExtend(const SDValue &N) {
16834 unsigned OC = N->getOpcode();
16835 return OC == ISD::LOAD || OC == ISD::MLOAD ||
16836 ISD::isConstantSplatVectorAllZeros(N.getNode());
16840 performSignExtendSetCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
16841 SelectionDAG &DAG) {
16842 // If we have (sext (setcc A B)) and A and B are cheap to extend,
16843 // we can move the sext into the arguments and have the same result. For
16844 // example, if A and B are both loads, we can make those extending loads and
16845 // avoid an extra instruction. This pattern appears often in VLS code
16846 // generation where the inputs to the setcc have a different size to the
16847 // instruction that wants to use the result of the setcc.
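// For example, (sext (setcc (load a), (load b), setlt)) becomes
// (setcc (sext (load a)), (sext (load b)), setlt), and the new extends can
// then be folded into the loads as extending loads.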
16848 assert(N->getOpcode() == ISD::SIGN_EXTEND &&
16849 N->getOperand(0)->getOpcode() == ISD::SETCC);
16850 const SDValue SetCC = N->getOperand(0);
16852 const SDValue CCOp0 = SetCC.getOperand(0);
16853 const SDValue CCOp1 = SetCC.getOperand(1);
16854 if (!CCOp0->getValueType(0).isInteger() ||
16855 !CCOp1->getValueType(0).isInteger())
16858 ISD::CondCode Code =
16859 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get();
16861 ISD::NodeType ExtType =
16862 isSignedIntSetCC(Code) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
16864 if (isCheapToExtend(SetCC.getOperand(0)) &&
16865 isCheapToExtend(SetCC.getOperand(1))) {
16866 const SDValue Ext1 =
16867 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp0);
16868 const SDValue Ext2 =
16869 DAG.getNode(ExtType, SDLoc(N), N->getValueType(0), CCOp1);
16871 return DAG.getSetCC(
16872 SDLoc(SetCC), N->getValueType(0), Ext1, Ext2,
16873 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get());
16879 static SDValue performExtendCombine(SDNode *N,
16880 TargetLowering::DAGCombinerInfo &DCI,
16881 SelectionDAG &DAG) {
16882 // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
16883 // we can convert that DUP into another extract_high (of a bigger DUP), which
16884 // helps the backend to decide that an sabdl2 would be useful, saving a real
16885 // extract_high operation.
16886 if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
16887 (N->getOperand(0).getOpcode() == ISD::ABDU ||
16888 N->getOperand(0).getOpcode() == ISD::ABDS)) {
16889 SDNode *ABDNode = N->getOperand(0).getNode();
16891 tryCombineLongOpWithDup(Intrinsic::not_intrinsic, ABDNode, DCI, DAG);
16892 if (!NewABD.getNode())
16895 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), NewABD);
16898 if (N->getValueType(0).isFixedLengthVector() &&
16899 N->getOpcode() == ISD::SIGN_EXTEND &&
16900 N->getOperand(0)->getOpcode() == ISD::SETCC)
16901 return performSignExtendSetCCCombine(N, DCI, DAG);
16906 static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
16907 SDValue SplatVal, unsigned NumVecElts) {
16908 assert(!St.isTruncatingStore() && "cannot split truncating vector store");
16909 Align OrigAlignment = St.getAlign();
16910 unsigned EltOffset = SplatVal.getValueType().getSizeInBits() / 8;
16912 // Create scalar stores. This is at least as good as the code sequence for a
16913 // split unaligned store which is a dup.s, ext.b, and two stores.
16914 // Most of the time the three stores should be replaced by store pair
16915 // instructions (stp).
16917 SDValue BasePtr = St.getBasePtr();
16918 uint64_t BaseOffset = 0;
16920 const MachinePointerInfo &PtrInfo = St.getPointerInfo();
16922 DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
16923 OrigAlignment, St.getMemOperand()->getFlags());
16925 // As this is in ISel, we will not merge this add, which may degrade results.
16926 if (BasePtr->getOpcode() == ISD::ADD &&
16927 isa<ConstantSDNode>(BasePtr->getOperand(1))) {
16928 BaseOffset = cast<ConstantSDNode>(BasePtr->getOperand(1))->getSExtValue();
16929 BasePtr = BasePtr->getOperand(0);
16932 unsigned Offset = EltOffset;
16933 while (--NumVecElts) {
16934 Align Alignment = commonAlignment(OrigAlignment, Offset);
16935 SDValue OffsetPtr =
16936 DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
16937 DAG.getConstant(BaseOffset + Offset, DL, MVT::i64));
16938 NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
16939 PtrInfo.getWithOffset(Offset), Alignment,
16940 St.getMemOperand()->getFlags());
16941 Offset += EltOffset;
16946 // Returns an SVE type that ContentTy can be trivially sign or zero extended into.
16948 static MVT getSVEContainerType(EVT ContentTy) {
16949 assert(ContentTy.isSimple() && "No SVE containers for extended types");
16951 switch (ContentTy.getSimpleVT().SimpleTy) {
16953 llvm_unreachable("No known SVE container for this MVT type");
16960 return MVT::nxv2i64;
16965 return MVT::nxv4i32;
16969 case MVT::nxv8bf16:
16970 return MVT::nxv8i16;
16972 return MVT::nxv16i8;
16976 static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
16978 EVT VT = N->getValueType(0);
16980 if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
16983 EVT ContainerVT = VT;
16984 if (ContainerVT.isInteger())
16985 ContainerVT = getSVEContainerType(ContainerVT);
16987 SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
16988 SDValue Ops[] = { N->getOperand(0), // Chain
16989 N->getOperand(2), // Pg
16990 N->getOperand(3), // Base
16991 DAG.getValueType(VT) };
16993 SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
16994 SDValue LoadChain = SDValue(Load.getNode(), 1);
16996 if (ContainerVT.isInteger() && (VT != ContainerVT))
16997 Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
16999 return DAG.getMergeValues({ Load, LoadChain }, DL);
17002 static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
17004 EVT VT = N->getValueType(0);
17005 EVT PtrTy = N->getOperand(3).getValueType();
17008 if (VT.isFloatingPoint())
17009 LoadVT = VT.changeTypeToInteger();
17011 auto *MINode = cast<MemIntrinsicSDNode>(N);
17012 SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
17013 SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
17014 MINode->getOperand(3), DAG.getUNDEF(PtrTy),
17015 MINode->getOperand(2), PassThru,
17016 MINode->getMemoryVT(), MINode->getMemOperand(),
17017 ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
17019 if (VT.isFloatingPoint()) {
17020 SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
17021 return DAG.getMergeValues(Ops, DL);
17027 template <unsigned Opcode>
17028 static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
17029 static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
17030 Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
17031 "Unsupported opcode.");
17033 EVT VT = N->getValueType(0);
17036 if (VT.isFloatingPoint())
17037 LoadVT = VT.changeTypeToInteger();
17039 SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
17040 SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
17041 SDValue LoadChain = SDValue(Load.getNode(), 1);
17043 if (VT.isFloatingPoint())
17044 Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
17046 return DAG.getMergeValues({Load, LoadChain}, DL);
17049 static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
17051 SDValue Data = N->getOperand(2);
17052 EVT DataVT = Data.getValueType();
17053 EVT HwSrcVt = getSVEContainerType(DataVT);
17054 SDValue InputVT = DAG.getValueType(DataVT);
17056 if (DataVT.isFloatingPoint())
17057 InputVT = DAG.getValueType(HwSrcVt);
17060 if (Data.getValueType().isFloatingPoint())
17061 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
17063 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
17065 SDValue Ops[] = { N->getOperand(0), // Chain
17067 N->getOperand(4), // Base
17068 N->getOperand(3), // Pg
17072 return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
17075 static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
17078 SDValue Data = N->getOperand(2);
17079 EVT DataVT = Data.getValueType();
17080 EVT PtrTy = N->getOperand(4).getValueType();
17082 if (DataVT.isFloatingPoint())
17083 Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
17085 auto *MINode = cast<MemIntrinsicSDNode>(N);
17086 return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
17087 DAG.getUNDEF(PtrTy), MINode->getOperand(3),
17088 MINode->getMemoryVT(), MINode->getMemOperand(),
17089 ISD::UNINDEXED, false, false);
17092 /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
17093 /// load store optimizer pass will merge them to store pair stores. This should
17094 /// be better than a movi to create the vector zero followed by a vector store
17095 /// if the zero constant is not re-used, since one instruction and one register
17096 /// live range will be removed.
17098 /// For example, the final generated code should be:
17100 /// stp xzr, xzr, [x0]
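///
/// instead of:
///
///   movi v0.2d, #0
///   str  q0, [x0]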
17107 static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17108 SDValue StVal = St.getValue();
17109 EVT VT = StVal.getValueType();
17111 // Avoid scalarizing zero splat stores for scalable vectors.
17112 if (VT.isScalableVector())
17115 // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
17116 // 2, 3 or 4 i32 elements.
17117 int NumVecElts = VT.getVectorNumElements();
17118 if (!(((NumVecElts == 2 || NumVecElts == 3) &&
17119 VT.getVectorElementType().getSizeInBits() == 64) ||
17120 ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
17121 VT.getVectorElementType().getSizeInBits() == 32)))
17124 if (StVal.getOpcode() != ISD::BUILD_VECTOR)
17127 // If the zero constant has more than one use then the vector store could be
17128 // better since the constant mov will be amortized and stp q instructions
17129 // should be able to be formed.
17130 if (!StVal.hasOneUse())
17133 // If the store is truncating then it's going down to i16 or smaller, which
17134 // means it can be implemented in a single store anyway.
17135 if (St.isTruncatingStore())
17138 // If the immediate offset of the address operand is too large for the stp
17139 // instruction, then bail out.
17140 if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
17141 int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
17142 if (Offset < -512 || Offset > 504)
17146 for (int I = 0; I < NumVecElts; ++I) {
17147 SDValue EltVal = StVal.getOperand(I);
17148 if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
17152 // Use a CopyFromReg WZR/XZR here to prevent
17153 // DAGCombiner::MergeConsecutiveStores from undoing this transformation.
17157 if (VT.getVectorElementType().getSizeInBits() == 32) {
17158 ZeroReg = AArch64::WZR;
17161 ZeroReg = AArch64::XZR;
17165 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ZeroReg, ZeroVT);
17166 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
17169 /// Replace a splat of a scalar to a vector store by scalar stores of the scalar
17170 /// value. The load store optimizer pass will merge them to store pair stores.
17171 /// This has better performance than a splat of the scalar followed by a split
17172 /// vector store. Even if the stores are not merged it is four stores vs a dup,
17173 /// followed by an ext.b and two stores.
17174 static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
17175 SDValue StVal = St.getValue();
17176 EVT VT = StVal.getValueType();
17178 // Don't replace floating point stores, they possibly won't be transformed to
17179 // stp because of the store pair suppress pass.
17180 if (VT.isFloatingPoint())
17183 // We can express a splat as store pair(s) for 2 or 4 elements.
17184 unsigned NumVecElts = VT.getVectorNumElements();
17185 if (NumVecElts != 4 && NumVecElts != 2)
17188 // If the store is truncating then it's going down to i16 or smaller, which
17189 // means it can be implemented in a single store anyway.
17190 if (St.isTruncatingStore())
17193 // Check that this is a splat.
17194 // Make sure that each of the relevant vector element locations are inserted
17195 // to, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
17196 std::bitset<4> IndexNotInserted((1 << NumVecElts) - 1);
17198 for (unsigned I = 0; I < NumVecElts; ++I) {
17199 // Check for insert vector elements.
17200 if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
17203 // Check that same value is inserted at each vector element.
17205 SplatVal = StVal.getOperand(1);
17206 else if (StVal.getOperand(1) != SplatVal)
17209 // Check insert element index.
17210 ConstantSDNode *CIndex = dyn_cast<ConstantSDNode>(StVal.getOperand(2));
17213 uint64_t IndexVal = CIndex->getZExtValue();
17214 if (IndexVal >= NumVecElts)
17216 IndexNotInserted.reset(IndexVal);
17218 StVal = StVal.getOperand(0);
17220 // Check that all vector element locations were inserted to.
17221 if (IndexNotInserted.any())
17224 return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
17227 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
17229 const AArch64Subtarget *Subtarget) {
17231 StoreSDNode *S = cast<StoreSDNode>(N);
17232 if (S->isVolatile() || S->isIndexed())
17235 SDValue StVal = S->getValue();
17236 EVT VT = StVal.getValueType();
17238 if (!VT.isFixedLengthVector())
17241 // If we get a splat of zeros, convert this vector store to a store of
17242 // scalars. They will be merged into store pairs of xzr thereby removing one
17243 // instruction and one register.
17244 if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
17245 return ReplacedZeroSplat;
17247 // FIXME: The logic for deciding if an unaligned store should be split should
17248 // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
17249 // a call to that function here.
17251 if (!Subtarget->isMisaligned128StoreSlow())
17254 // Don't split at -Oz.
17255 if (DAG.getMachineFunction().getFunction().hasMinSize())
17258 // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
17259 // those up regresses performance on micro-benchmarks and olden/bh.
17260 if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
17263 // Split unaligned 16B stores. They are terrible for performance.
17264 // Don't split stores with alignment of 1 or 2. Code that uses clang vector
17265 // extensions can use this to mark that it does not want splitting to happen
17266 // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
17267 // eliminating alignment hazards is only 1 in 8 for alignment of 2.
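// For example, a 16-byte store with 4-byte alignment is rewritten below as
// two 8-byte stores of the low and high halves at offsets 0 and 8, which the
// load/store optimizer can later pair.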
17268 if (VT.getSizeInBits() != 128 || S->getAlign() >= Align(16) ||
17269 S->getAlign() <= Align(2))
17272 // If we get a splat of a scalar convert this vector store to a store of
17273 // scalars. They will be merged into store pairs thereby removing two instructions and one register.
17275 if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
17276 return ReplacedSplat;
17280 // Split VT into two.
17281 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
17282 unsigned NumElts = HalfVT.getVectorNumElements();
17283 SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
17284 DAG.getConstant(0, DL, MVT::i64));
17285 SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
17286 DAG.getConstant(NumElts, DL, MVT::i64));
17287 SDValue BasePtr = S->getBasePtr();
17289 DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
17290 S->getAlign(), S->getMemOperand()->getFlags());
17291 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
17292 DAG.getConstant(8, DL, MVT::i64));
17293 return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
17294 S->getPointerInfo(), S->getAlign(),
17295 S->getMemOperand()->getFlags());
17298 static SDValue performSpliceCombine(SDNode *N, SelectionDAG &DAG) {
17299 assert(N->getOpcode() == AArch64ISD::SPLICE && "Unexpected Opcode!");
17301 // splice(pg, op1, undef) -> op1
17302 if (N->getOperand(2).isUndef())
17303 return N->getOperand(1);
17308 static SDValue performUnpackCombine(SDNode *N, SelectionDAG &DAG,
17309 const AArch64Subtarget *Subtarget) {
17310 assert((N->getOpcode() == AArch64ISD::UUNPKHI ||
17311 N->getOpcode() == AArch64ISD::UUNPKLO) &&
17312 "Unexpected Opcode!");
17314 // uunpklo/hi undef -> undef
17315 if (N->getOperand(0).isUndef())
17316 return DAG.getUNDEF(N->getValueType(0));
17318 // If this is a masked load followed by an UUNPKLO, fold this into a masked
17319 // extending load. We can do this even if this is already a masked zero-extending load.
17321 if (N->getOperand(0).getOpcode() == ISD::MLOAD &&
17322 N->getOpcode() == AArch64ISD::UUNPKLO) {
17323 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N->getOperand(0));
17324 SDValue Mask = MLD->getMask();
17327 if (MLD->isUnindexed() && MLD->getExtensionType() != ISD::SEXTLOAD &&
17328 SDValue(MLD, 0).hasOneUse() && Mask->getOpcode() == AArch64ISD::PTRUE &&
17329 (MLD->getPassThru()->isUndef() ||
17330 isZerosVector(MLD->getPassThru().getNode()))) {
17331 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
17332 unsigned PgPattern = Mask->getConstantOperandVal(0);
17333 EVT VT = N->getValueType(0);
17335 // Ensure we can double the size of the predicate pattern
17336 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
17338 NumElts * VT.getVectorElementType().getSizeInBits() <= MinSVESize) {
17340 getPTrue(DAG, DL, VT.changeVectorElementType(MVT::i1), PgPattern);
17341 SDValue PassThru = DAG.getConstant(0, DL, VT);
17342 SDValue NewLoad = DAG.getMaskedLoad(
17343 VT, DL, MLD->getChain(), MLD->getBasePtr(), MLD->getOffset(), Mask,
17344 PassThru, MLD->getMemoryVT(), MLD->getMemOperand(),
17345 MLD->getAddressingMode(), ISD::ZEXTLOAD);
17347 DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), NewLoad.getValue(1));
17357 static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
17359 SDValue Op0 = N->getOperand(0);
17360 SDValue Op1 = N->getOperand(1);
17361 EVT ResVT = N->getValueType(0);
17363 // uzp1(x, undef) -> concat(truncate(x), undef)
17364 if (Op1.getOpcode() == ISD::UNDEF) {
17365 EVT BCVT = MVT::Other, HalfVT = MVT::Other;
17366 switch (ResVT.getSimpleVT().SimpleTy) {
17371 HalfVT = MVT::v8i8;
17375 HalfVT = MVT::v4i16;
17379 HalfVT = MVT::v2i32;
17382 if (BCVT != MVT::Other) {
17383 SDValue BC = DAG.getBitcast(BCVT, Op0);
17384 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, BC);
17385 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Trunc,
17386 DAG.getUNDEF(HalfVT));
17390 // uzp1(unpklo(uzp1(x, y)), z) => uzp1(x, z)
17391 if (Op0.getOpcode() == AArch64ISD::UUNPKLO) {
17392 if (Op0.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
17393 SDValue X = Op0.getOperand(0).getOperand(0);
17394 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, X, Op1);
17398 // uzp1(x, unpkhi(uzp1(y, z))) => uzp1(x, z)
17399 if (Op1.getOpcode() == AArch64ISD::UUNPKHI) {
17400 if (Op1.getOperand(0).getOpcode() == AArch64ISD::UZP1) {
17401 SDValue Z = Op1.getOperand(0).getOperand(1);
17402 return DAG.getNode(AArch64ISD::UZP1, DL, ResVT, Op0, Z);
17409 static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
17410 unsigned Opc = N->getOpcode();
17412 assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
17413 Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
17414 (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
17415 Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
17416 "Invalid opcode.");
17418 const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
17419 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
17420 const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
17421 Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
17422 const bool Extended = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
17423 Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO ||
17424 Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
17425 Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
17428 SDValue Chain = N->getOperand(0);
17429 SDValue Pg = N->getOperand(1);
17430 SDValue Base = N->getOperand(2);
17431 SDValue Offset = N->getOperand(3);
17432 SDValue Ty = N->getOperand(4);
17434 EVT ResVT = N->getValueType(0);
17436 const auto OffsetOpc = Offset.getOpcode();
17437 const bool OffsetIsZExt =
17438 OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
17439 const bool OffsetIsSExt =
17440 OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
17442 // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
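// For example, if the nxv2i64 offsets are produced by a predicated
// sign-extend from 32 bits using the same predicate as the gather, the load
// can instead use the 32-bit sign-extending (SXTW) addressing form directly.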
17443 if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
17444 SDValue ExtPg = Offset.getOperand(0);
17445 VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
17446 EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
17448 // If the predicate for the sign- or zero-extended offset is the
17449 // same as the predicate used for this load and the sign-/zero-extension
17450 // was from 32 bits...
17451 if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
17452 SDValue UnextendedOffset = Offset.getOperand(1);
17454 unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
17456 NewOpc = getSignExtendedGatherOpcode(NewOpc);
17458 return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
17459 {Chain, Pg, Base, UnextendedOffset, Ty});
17466 /// Optimize a vector shift instruction and its operand if shifted out
17467 /// bits are not used.
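/// For example, (VLSHR x, #8) on v8i16 never observes the low 8 bits of each
/// lane of x, so work that only affects those bits can be simplified away.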
17468 static SDValue performVectorShiftCombine(SDNode *N,
17469 const AArch64TargetLowering &TLI,
17470 TargetLowering::DAGCombinerInfo &DCI) {
17471 assert(N->getOpcode() == AArch64ISD::VASHR ||
17472 N->getOpcode() == AArch64ISD::VLSHR);
17474 SDValue Op = N->getOperand(0);
17475 unsigned OpScalarSize = Op.getScalarValueSizeInBits();
17477 unsigned ShiftImm = N->getConstantOperandVal(1);
17478 assert(OpScalarSize > ShiftImm && "Invalid shift imm");
17480 APInt ShiftedOutBits = APInt::getLowBitsSet(OpScalarSize, ShiftImm);
17481 APInt DemandedMask = ~ShiftedOutBits;
17483 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
17484 return SDValue(N, 0);
17489 static SDValue performSunpkloCombine(SDNode *N, SelectionDAG &DAG) {
17490 // sunpklo(sext(pred)) -> sext(extract_low_half(pred))
17491 // This transform works in partnership with performSetCCPunpkCombine to
17492 // remove unnecessary transfer of predicates into standard registers and back
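// For example, sunpklo(sext(p:nxv16i1 to nxv16i8)) becomes
// sext(extract_subvector(p, 0):nxv8i1 to nxv8i16), keeping the predicate in
// a predicate register.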
17493 if (N->getOperand(0).getOpcode() == ISD::SIGN_EXTEND &&
17494 N->getOperand(0)->getOperand(0)->getValueType(0).getScalarType() ==
17496 SDValue CC = N->getOperand(0)->getOperand(0);
17497 auto VT = CC->getValueType(0).getHalfNumVectorElementsVT(*DAG.getContext());
17498 SDValue Unpk = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, CC,
17499 DAG.getVectorIdxConstant(0, SDLoc(N)));
17500 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), N->getValueType(0), Unpk);
17506 /// Target-specific DAG combine function for post-increment LD1 (lane) and
17507 /// post-increment LD1R.
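/// For example (sketch): (insert_vector_elt V, (load Addr), Lane), where Addr
/// is separately incremented by the element size, can become LD1LANEpost,
/// which also produces the post-incremented address.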
17508 static SDValue performPostLD1Combine(SDNode *N,
17509 TargetLowering::DAGCombinerInfo &DCI,
17511 if (DCI.isBeforeLegalizeOps())
17514 SelectionDAG &DAG = DCI.DAG;
17515 EVT VT = N->getValueType(0);
17517 if (!VT.is128BitVector() && !VT.is64BitVector())
17520 unsigned LoadIdx = IsLaneOp ? 1 : 0;
17521 SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not a LOAD node, we cannot do this combine.
17523 if (LD->getOpcode() != ISD::LOAD)
17526 // The vector lane must be a constant in the LD1LANE opcode.
17529 Lane = N->getOperand(2);
17530 auto *LaneC = dyn_cast<ConstantSDNode>(Lane);
17531 if (!LaneC || LaneC->getZExtValue() >= VT.getVectorNumElements())
17535 LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
17536 EVT MemVT = LoadSDN->getMemoryVT();
17537 // Check if memory operand is the same type as the vector element.
17538 if (MemVT != VT.getVectorElementType())
// Check if there are other uses. If so, do not combine as it will introduce
// an extra load.
17543 for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
17545 if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
17551 SDValue Addr = LD->getOperand(1);
17552 SDValue Vector = N->getOperand(0);
17553 // Search for a use of the address operand that is an increment.
17554 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
17555 Addr.getNode()->use_end(); UI != UE; ++UI) {
17556 SDNode *User = *UI;
17557 if (User->getOpcode() != ISD::ADD
17558 || UI.getUse().getResNo() != Addr.getResNo())
17561 // If the increment is a constant, it must match the memory ref size.
17562 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
17563 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
17564 uint32_t IncVal = CInc->getZExtValue();
17565 unsigned NumBytes = VT.getScalarSizeInBits() / 8;
17566 if (IncVal != NumBytes)
17568 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
17571 // To avoid cycle construction make sure that neither the load nor the add
17572 // are predecessors to each other or the Vector.
17573 SmallPtrSet<const SDNode *, 32> Visited;
17574 SmallVector<const SDNode *, 16> Worklist;
17575 Visited.insert(Addr.getNode());
17576 Worklist.push_back(User);
17577 Worklist.push_back(LD);
17578 Worklist.push_back(Vector.getNode());
17579 if (SDNode::hasPredecessorHelper(LD, Visited, Worklist) ||
17580 SDNode::hasPredecessorHelper(User, Visited, Worklist))
17583 SmallVector<SDValue, 8> Ops;
17584 Ops.push_back(LD->getOperand(0)); // Chain
17586 Ops.push_back(Vector); // The vector to be inserted
17587 Ops.push_back(Lane); // The lane to be inserted in the vector
17589 Ops.push_back(Addr);
17590 Ops.push_back(Inc);
17592 EVT Tys[3] = { VT, MVT::i64, MVT::Other };
17593 SDVTList SDTys = DAG.getVTList(Tys);
17594 unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
17595 SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
17597 LoadSDN->getMemOperand());
17599 // Update the uses.
17600 SDValue NewResults[] = {
17601 SDValue(LD, 0), // The result of load
17602 SDValue(UpdN.getNode(), 2) // Chain
17604 DCI.CombineTo(LD, NewResults);
17605 DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
17606 DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register
17613 /// Simplify ``Addr`` given that the top byte of it is ignored by HW during
17614 /// address translation.
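/// For example, with TBI an explicit (and addr, 0x00FFFFFFFFFFFFFF) feeding
/// only a memory address is redundant, since only bits [55:0] are demanded.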
17615 static bool performTBISimplification(SDValue Addr,
17616 TargetLowering::DAGCombinerInfo &DCI,
17617 SelectionDAG &DAG) {
17618 APInt DemandedMask = APInt::getLowBitsSet(64, 56);
17620 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
17621 !DCI.isBeforeLegalizeOps());
17622 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17623 if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
17624 DCI.CommitTargetLoweringOpt(TLO);
17630 static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
17631 assert((N->getOpcode() == ISD::STORE || N->getOpcode() == ISD::MSTORE) &&
17632 "Expected STORE dag node in input!");
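// For the unmasked case, fold truncstore(zext/sext/aext(X)) -> store(X) when
// the store's memory type matches X's original type, making the extension
// dead.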
17634 if (auto Store = dyn_cast<StoreSDNode>(N)) {
17635 if (!Store->isTruncatingStore() || Store->isIndexed())
17637 SDValue Ext = Store->getValue();
17638 auto ExtOpCode = Ext.getOpcode();
17639 if (ExtOpCode != ISD::ZERO_EXTEND && ExtOpCode != ISD::SIGN_EXTEND &&
17640 ExtOpCode != ISD::ANY_EXTEND)
17642 SDValue Orig = Ext->getOperand(0);
17643 if (Store->getMemoryVT() != Orig.getValueType())
17645 return DAG.getStore(Store->getChain(), SDLoc(Store), Orig,
17646 Store->getBasePtr(), Store->getMemOperand());
17652 static SDValue performSTORECombine(SDNode *N,
17653 TargetLowering::DAGCombinerInfo &DCI,
17655 const AArch64Subtarget *Subtarget) {
17656 StoreSDNode *ST = cast<StoreSDNode>(N);
17657 SDValue Chain = ST->getChain();
17658 SDValue Value = ST->getValue();
17659 SDValue Ptr = ST->getBasePtr();
17661 // If this is an FP_ROUND followed by a store, fold this into a truncating
17662 // store. We can do this even if this is already a truncstore.
17663 // We purposefully don't care about legality of the nodes here as we know
17664 // they can be split down into something legal.
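// E.g. (sketch) with a 512-bit minimum SVE width, a store of
// (fp_round v16f64 X to v16f32) can instead be emitted as a single truncating
// store of X.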
17665 if (DCI.isBeforeLegalizeOps() && Value.getOpcode() == ISD::FP_ROUND &&
17666 Value.getNode()->hasOneUse() && ST->isUnindexed() &&
17667 Subtarget->useSVEForFixedLengthVectors() &&
17668 Value.getValueType().isFixedLengthVector() &&
17669 Value.getValueType().getFixedSizeInBits() >=
17670 Subtarget->getMinSVEVectorSizeInBits())
17671 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
17672 ST->getMemoryVT(), ST->getMemOperand());
17674 if (SDValue Split = splitStores(N, DCI, DAG, Subtarget))
17677 if (Subtarget->supportsAddressTopByteIgnored() &&
17678 performTBISimplification(N->getOperand(2), DCI, DAG))
17679 return SDValue(N, 0);
17681 if (SDValue Store = foldTruncStoreOfExt(DAG, N))
17687 static SDValue performMSTORECombine(SDNode *N,
17688 TargetLowering::DAGCombinerInfo &DCI,
17690 const AArch64Subtarget *Subtarget) {
17691 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
17692 SDValue Value = MST->getValue();
17693 SDValue Mask = MST->getMask();
17696 // If this is a UZP1 followed by a masked store, fold this into a masked
// truncating store. We can do this even if this is already a masked
// truncstore.
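// For example (roughly): a masked store of
// (uzp1 (bitcast nxv4i32 X to nxv8i16), _) under a limited-VL ptrue writes the
// low 16 bits of each i32 element of X, so it can be re-expressed as a
// truncating masked store of X under a widened-element predicate.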
17699 if (Value.getOpcode() == AArch64ISD::UZP1 && Value->hasOneUse() &&
17700 MST->isUnindexed() && Mask->getOpcode() == AArch64ISD::PTRUE &&
17701 Value.getValueType().isInteger()) {
17702 Value = Value.getOperand(0);
17703 if (Value.getOpcode() == ISD::BITCAST) {
17705 Value.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
17706 EVT InVT = Value.getOperand(0).getValueType();
17708 if (HalfVT.widenIntegerVectorElementType(*DAG.getContext()) == InVT) {
17709 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
17710 unsigned PgPattern = Mask->getConstantOperandVal(0);
17712 // Ensure we can double the size of the predicate pattern
17713 unsigned NumElts = getNumElementsFromSVEPredPattern(PgPattern);
17714 if (NumElts && NumElts * InVT.getVectorElementType().getSizeInBits() <=
17716 Mask = getPTrue(DAG, DL, InVT.changeVectorElementType(MVT::i1),
17718 return DAG.getMaskedStore(MST->getChain(), DL, Value.getOperand(0),
17719 MST->getBasePtr(), MST->getOffset(), Mask,
17720 MST->getMemoryVT(), MST->getMemOperand(),
17721 MST->getAddressingMode(),
17722 /*IsTruncating=*/true);
17731 /// \return true if part of the index was folded into the Base.
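/// For example, a gather/scatter with Index = (add X, splat(C)) can instead
/// use Index = X with the base pointer advanced by C * Scale.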
17732 static bool foldIndexIntoBase(SDValue &BasePtr, SDValue &Index, SDValue Scale,
17733 SDLoc DL, SelectionDAG &DAG) {
17734 // This function assumes a vector of i64 indices.
17735 EVT IndexVT = Index.getValueType();
17736 if (!IndexVT.isVector() || IndexVT.getVectorElementType() != MVT::i64)
17741 // Index = X + splat(Offset)
17743 // BasePtr = Ptr + Offset * scale.
17745 if (Index.getOpcode() == ISD::ADD) {
17746 if (auto Offset = DAG.getSplatValue(Index.getOperand(1))) {
17747 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
17748 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
17749 Index = Index.getOperand(0);
17756 // Index = (X + splat(Offset)) << splat(Shift)
// BasePtr = Ptr + (Offset << Shift) * scale
// Index = X << splat(Shift)
17760 if (Index.getOpcode() == ISD::SHL &&
17761 Index.getOperand(0).getOpcode() == ISD::ADD) {
17762 SDValue Add = Index.getOperand(0);
17763 SDValue ShiftOp = Index.getOperand(1);
17764 SDValue OffsetOp = Add.getOperand(1);
17765 if (auto Shift = DAG.getSplatValue(ShiftOp))
17766 if (auto Offset = DAG.getSplatValue(OffsetOp)) {
17767 Offset = DAG.getNode(ISD::SHL, DL, MVT::i64, Offset, Shift);
17768 Offset = DAG.getNode(ISD::MUL, DL, MVT::i64, Offset, Scale);
17769 BasePtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, Offset);
17770 Index = DAG.getNode(ISD::SHL, DL, Index.getValueType(),
17771 Add.getOperand(0), ShiftOp);
17779 // Analyse the specified address returning true if a more optimal addressing
17780 // mode is available. When returning true all parameters are updated to reflect
17781 // their recommended values.
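// For example (sketch): an nxv4i64 STEP_VECTOR index, which would have to be
// split during legalisation, can be replaced by an nxv4i32 STEP_VECTOR when
// every element offset is known to fit in a signed 32-bit value.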
17782 static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N,
17783 SDValue &BasePtr, SDValue &Index,
17784 SelectionDAG &DAG) {
17785 // Try to iteratively fold parts of the index into the base pointer to
17786 // simplify the index as much as possible.
17787 bool Changed = false;
17788 while (foldIndexIntoBase(BasePtr, Index, N->getScale(), SDLoc(N), DAG))
17791 // Only consider element types that are pointer sized as smaller types can
17792 // be easily promoted.
17793 EVT IndexVT = Index.getValueType();
17794 if (IndexVT.getVectorElementType() != MVT::i64 || IndexVT == MVT::nxv2i64)
17798 // Index = step(const)
17799 int64_t Stride = 0;
17800 if (Index.getOpcode() == ISD::STEP_VECTOR)
17801 Stride = cast<ConstantSDNode>(Index.getOperand(0))->getSExtValue();
17804 // Index = step(const) << shift(const)
17805 else if (Index.getOpcode() == ISD::SHL &&
17806 Index.getOperand(0).getOpcode() == ISD::STEP_VECTOR) {
17807 SDValue RHS = Index.getOperand(1);
17809 dyn_cast_or_null<ConstantSDNode>(DAG.getSplatValue(RHS))) {
17810 int64_t Step = (int64_t)Index.getOperand(0).getConstantOperandVal(1);
17811 Stride = Step << Shift->getZExtValue();
17815 // Return early because no supported pattern is found.
17819 if (Stride < std::numeric_limits<int32_t>::min() ||
17820 Stride > std::numeric_limits<int32_t>::max())
17823 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
17824 unsigned MaxVScale =
17825 Subtarget.getMaxSVEVectorSizeInBits() / AArch64::SVEBitsPerBlock;
17826 int64_t LastElementOffset =
17827 IndexVT.getVectorMinNumElements() * Stride * MaxVScale;
17829 if (LastElementOffset < std::numeric_limits<int32_t>::min() ||
17830 LastElementOffset > std::numeric_limits<int32_t>::max())
17833 EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32);
// Stride is not scaled explicitly by 'Scale' here, because that scaling is
// already performed by the gather/scatter addressing mode.
17836 Index = DAG.getNode(ISD::STEP_VECTOR, SDLoc(N), NewIndexVT,
17837 DAG.getTargetConstant(Stride, SDLoc(N), MVT::i32));
17841 static SDValue performMaskedGatherScatterCombine(
17842 SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) {
17843 MaskedGatherScatterSDNode *MGS = cast<MaskedGatherScatterSDNode>(N);
17844 assert(MGS && "Can only combine gather load or scatter store nodes");
17846 if (!DCI.isBeforeLegalize())
17850 SDValue Chain = MGS->getChain();
17851 SDValue Scale = MGS->getScale();
17852 SDValue Index = MGS->getIndex();
17853 SDValue Mask = MGS->getMask();
17854 SDValue BasePtr = MGS->getBasePtr();
17855 ISD::MemIndexType IndexType = MGS->getIndexType();
17857 if (!findMoreOptimalIndexType(MGS, BasePtr, Index, DAG))
17860 // Here we catch such cases early and change MGATHER's IndexType to allow
17861 // the use of an Index that's more legalisation friendly.
17862 if (auto *MGT = dyn_cast<MaskedGatherSDNode>(MGS)) {
17863 SDValue PassThru = MGT->getPassThru();
17864 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
17865 return DAG.getMaskedGather(
17866 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
17867 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
17869 auto *MSC = cast<MaskedScatterSDNode>(MGS);
17870 SDValue Data = MSC->getValue();
17871 SDValue Ops[] = {Chain, Data, Mask, BasePtr, Index, Scale};
17872 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), DL,
17873 Ops, MSC->getMemOperand(), IndexType,
17874 MSC->isTruncatingStore());
17877 /// Target-specific DAG combine function for NEON load/store intrinsics
17878 /// to merge base address updates.
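/// For example (illustrative): "ld2 { v0.4s, v1.4s }, [x0]" followed by
/// "add x0, x0, #32" can be combined into "ld2 { v0.4s, v1.4s }, [x0], #32".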
17879 static SDValue performNEONPostLDSTCombine(SDNode *N,
17880 TargetLowering::DAGCombinerInfo &DCI,
17881 SelectionDAG &DAG) {
17882 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17885 unsigned AddrOpIdx = N->getNumOperands() - 1;
17886 SDValue Addr = N->getOperand(AddrOpIdx);
17888 // Search for a use of the address operand that is an increment.
17889 for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
17890 UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
17891 SDNode *User = *UI;
17892 if (User->getOpcode() != ISD::ADD ||
17893 UI.getUse().getResNo() != Addr.getResNo())
17896 // Check that the add is independent of the load/store. Otherwise, folding
17897 // it would create a cycle.
17898 SmallPtrSet<const SDNode *, 32> Visited;
17899 SmallVector<const SDNode *, 16> Worklist;
17900 Visited.insert(Addr.getNode());
17901 Worklist.push_back(N);
17902 Worklist.push_back(User);
17903 if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
17904 SDNode::hasPredecessorHelper(User, Visited, Worklist))
17907 // Find the new opcode for the updating load/store.
17908 bool IsStore = false;
17909 bool IsLaneOp = false;
17910 bool IsDupOp = false;
17911 unsigned NewOpc = 0;
17912 unsigned NumVecs = 0;
17913 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
17915 default: llvm_unreachable("unexpected intrinsic for Neon base update");
17916 case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post;
17917 NumVecs = 2; break;
17918 case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post;
17919 NumVecs = 3; break;
17920 case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post;
17921 NumVecs = 4; break;
17922 case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post;
17923 NumVecs = 2; IsStore = true; break;
17924 case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post;
17925 NumVecs = 3; IsStore = true; break;
17926 case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post;
17927 NumVecs = 4; IsStore = true; break;
17928 case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post;
17929 NumVecs = 2; break;
17930 case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post;
17931 NumVecs = 3; break;
17932 case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post;
17933 NumVecs = 4; break;
17934 case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post;
17935 NumVecs = 2; IsStore = true; break;
17936 case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post;
17937 NumVecs = 3; IsStore = true; break;
17938 case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post;
17939 NumVecs = 4; IsStore = true; break;
17940 case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost;
17941 NumVecs = 2; IsDupOp = true; break;
17942 case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost;
17943 NumVecs = 3; IsDupOp = true; break;
17944 case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost;
17945 NumVecs = 4; IsDupOp = true; break;
17946 case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost;
17947 NumVecs = 2; IsLaneOp = true; break;
17948 case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost;
17949 NumVecs = 3; IsLaneOp = true; break;
17950 case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost;
17951 NumVecs = 4; IsLaneOp = true; break;
17952 case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost;
17953 NumVecs = 2; IsStore = true; IsLaneOp = true; break;
17954 case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost;
17955 NumVecs = 3; IsStore = true; IsLaneOp = true; break;
17956 case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost;
17957 NumVecs = 4; IsStore = true; IsLaneOp = true; break;
17962 VecTy = N->getOperand(2).getValueType();
17964 VecTy = N->getValueType(0);
17966 // If the increment is a constant, it must match the memory ref size.
17967 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
17968 if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
17969 uint32_t IncVal = CInc->getZExtValue();
17970 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
17971 if (IsLaneOp || IsDupOp)
17972 NumBytes /= VecTy.getVectorNumElements();
17973 if (IncVal != NumBytes)
17975 Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
17977 SmallVector<SDValue, 8> Ops;
17978 Ops.push_back(N->getOperand(0)); // Incoming chain
// Load-lane and store operations have a vector list as input.
17980 if (IsLaneOp || IsStore)
17981 for (unsigned i = 2; i < AddrOpIdx; ++i)
17982 Ops.push_back(N->getOperand(i));
17983 Ops.push_back(Addr); // Base register
17984 Ops.push_back(Inc);
17988 unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
  for (n = 0; n < NumResultVecs; ++n)
    Tys[n] = VecTy;     // Result vectors keep the operation's vector type.
  Tys[n++] = MVT::i64;  // Type of write back register
17993 Tys[n] = MVT::Other; // Type of the chain
17994 SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
17996 MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
17997 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
17998 MemInt->getMemoryVT(),
17999 MemInt->getMemOperand());
18001 // Update the uses.
18002 std::vector<SDValue> NewResults;
18003 for (unsigned i = 0; i < NumResultVecs; ++i) {
18004 NewResults.push_back(SDValue(UpdN.getNode(), i));
18006 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
18007 DCI.CombineTo(N, NewResults);
18008 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
18015 // Checks to see if the value is the prescribed width and returns information
18016 // about its extension mode.
18018 bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
18019 ExtType = ISD::NON_EXTLOAD;
18020 switch(V.getNode()->getOpcode()) {
18024 LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode());
18025 if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8)
18026 || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) {
18027 ExtType = LoadNode->getExtensionType();
18032 case ISD::AssertSext: {
18033 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18034 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18035 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18036 ExtType = ISD::SEXTLOAD;
18041 case ISD::AssertZext: {
18042 VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1));
18043 if ((TypeNode->getVT() == MVT::i8 && width == 8)
18044 || (TypeNode->getVT() == MVT::i16 && width == 16)) {
18045 ExtType = ISD::ZEXTLOAD;
18050 case ISD::Constant:
18051 case ISD::TargetConstant: {
18052 return std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) <
18053 1LL << (width - 1);
18060 // This function does a whole lot of voodoo to determine if the tests are
18061 // equivalent without and with a mask. Essentially what happens is that given a
// DAG of the form
//
//   (CMP (AND (ADD Input, AddConstant), 0xff), CompConstant)
//
// evaluated under condition code CC, i.e. Input and AddConstant feed an ADD,
// the ADD and an 0xff mask feed an AND, and the AND, CompConstant and CC feed
// the final comparison.
18085 // The AND node may be safely removed for some combinations of inputs. In
18086 // particular we need to take into account the extension type of the Input,
18087 // the exact values of AddConstant, CompConstant, and CC, along with the nominal
// width of the input (this can work for any-width inputs; the pattern above
// is specific to 8 bits).
// The specific equations were worked out by generating output tables for each
// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
// problem was simplified by working with 4-bit inputs, which means we only
// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
// extension (8..15), 8 patterns unique to sign extension (-8..-1), and 8
// patterns present in both extensions (0..7). For every distinct pair of
// AddConstant and CompConstant bit patterns we can consider the masked and
// unmasked versions to be equivalent if the result of this function is true
// for all 16 distinct bit patterns of the current extension type of Input (w0).
// (The harness masked the input with "and w10, w8, #0x0f", materialised the
// condition for the masked and the unmasked compare with "cset w9, AArch64CC"
// and "cset w11, AArch64CC" respectively, and then compared the two results.)
// Since this check shows when the outputs are equivalent, it defines when it
// is safe to remove the AND. Unfortunately it only runs on AArch64 and would
// be expensive to run during compiles. The equations below were written in a
// test harness that confirmed they give outputs equivalent to that check for
// all inputs, so they can be used to determine whether the removal is legal.
//
// isEquivalentMaskless() is the code for testing whether the AND can be
// removed, factored out of the DAG recognition since the DAG can take several
// forms.
18121 static bool isEquivalentMaskless(unsigned CC, unsigned width,
18122 ISD::LoadExtType ExtType, int AddConstant,
18123 int CompConstant) {
// By being careful about our equations and only writing them in terms of
// symbolic values and well-known constants (0, 1, -1, MaxUInt) we can
// make them generally applicable to all bit widths.
18127 int MaxUInt = (1 << width);
18129 // For the purposes of these comparisons sign extending the type is
18130 // equivalent to zero extending the add and displacing it by half the integer
18131 // width. Provided we are careful and make sure our equations are valid over
18132 // the whole range we can just adjust the input and avoid writing equations
18133 // for sign extended inputs.
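// For example, with width == 8 a sign-extended input spans [-128, 127];
// mapping it onto the unsigned range [0, 255] shifts the input up by 128,
// which is compensated for here by subtracting 128 from AddConstant.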
18134 if (ExtType == ISD::SEXTLOAD)
18135 AddConstant -= (1 << (width-1));
18138 case AArch64CC::LE:
18139 case AArch64CC::GT:
18140 if ((AddConstant == 0) ||
18141 (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
18142 (AddConstant >= 0 && CompConstant < 0) ||
18143 (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant))
18146 case AArch64CC::LT:
18147 case AArch64CC::GE:
18148 if ((AddConstant == 0) ||
18149 (AddConstant >= 0 && CompConstant <= 0) ||
18150 (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant))
18153 case AArch64CC::HI:
18154 case AArch64CC::LS:
18155 if ((AddConstant >= 0 && CompConstant < 0) ||
18156 (AddConstant <= 0 && CompConstant >= -1 &&
18157 CompConstant < AddConstant + MaxUInt))
18160 case AArch64CC::PL:
18161 case AArch64CC::MI:
18162 if ((AddConstant == 0) ||
18163 (AddConstant > 0 && CompConstant <= 0) ||
18164 (AddConstant < 0 && CompConstant <= AddConstant))
18167 case AArch64CC::LO:
18168 case AArch64CC::HS:
18169 if ((AddConstant >= 0 && CompConstant <= 0) ||
18170 (AddConstant <= 0 && CompConstant >= 0 &&
18171 CompConstant <= AddConstant + MaxUInt))
18174 case AArch64CC::EQ:
18175 case AArch64CC::NE:
18176 if ((AddConstant > 0 && CompConstant < 0) ||
18177 (AddConstant < 0 && CompConstant >= 0 &&
18178 CompConstant < AddConstant + MaxUInt) ||
18179 (AddConstant >= 0 && CompConstant >= 0 &&
18180 CompConstant >= AddConstant) ||
18181 (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant))
18184 case AArch64CC::VS:
18185 case AArch64CC::VC:
18186 case AArch64CC::AL:
18187 case AArch64CC::NV:
18189 case AArch64CC::Invalid:
18197 SDValue performCONDCombine(SDNode *N,
18198 TargetLowering::DAGCombinerInfo &DCI,
18199 SelectionDAG &DAG, unsigned CCIndex,
18200 unsigned CmpIndex) {
18201 unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
18202 SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
18203 unsigned CondOpcode = SubsNode->getOpcode();
18205 if (CondOpcode != AArch64ISD::SUBS)
// There is a SUBS feeding this condition. Is it fed by a mask we can
// use?
18211 SDNode *AndNode = SubsNode->getOperand(0).getNode();
18212 unsigned MaskBits = 0;
18214 if (AndNode->getOpcode() != ISD::AND)
18217 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
18218 uint32_t CNV = CN->getZExtValue();
18221 else if (CNV == 65535)
18228 SDValue AddValue = AndNode->getOperand(0);
18230 if (AddValue.getOpcode() != ISD::ADD)
18233 // The basic dag structure is correct, grab the inputs and validate them.
18235 SDValue AddInputValue1 = AddValue.getNode()->getOperand(0);
18236 SDValue AddInputValue2 = AddValue.getNode()->getOperand(1);
18237 SDValue SubsInputValue = SubsNode->getOperand(1);
// The mask is present and all of the values come from a smaller type, so
// let's see if the mask is superfluous.
18242 if (!isa<ConstantSDNode>(AddInputValue2.getNode()) ||
18243 !isa<ConstantSDNode>(SubsInputValue.getNode()))
18246 ISD::LoadExtType ExtType;
18248 if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) ||
18249 !checkValueWidth(AddInputValue2, MaskBits, ExtType) ||
18250 !checkValueWidth(AddInputValue1, MaskBits, ExtType) )
18253 if(!isEquivalentMaskless(CC, MaskBits, ExtType,
18254 cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(),
18255 cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue()))
18258 // The AND is not necessary, remove it.
18260 SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
18261 SubsNode->getValueType(1));
18262 SDValue Ops[] = { AddValue, SubsNode->getOperand(1) };
18264 SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
18265 DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode());
18267 return SDValue(N, 0);
18270 // Optimize compare with zero and branch.
18271 static SDValue performBRCONDCombine(SDNode *N,
18272 TargetLowering::DAGCombinerInfo &DCI,
18273 SelectionDAG &DAG) {
18274 MachineFunction &MF = DAG.getMachineFunction();
18275 // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z instructions
// will not be produced, as they are conditional branch instructions that do
// not set flags.
18278 if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
18281 if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
18283 SDValue Chain = N->getOperand(0);
18284 SDValue Dest = N->getOperand(1);
18285 SDValue CCVal = N->getOperand(2);
18286 SDValue Cmp = N->getOperand(3);
18288 assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
18289 unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
18290 if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
18293 unsigned CmpOpc = Cmp.getOpcode();
18294 if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
// Only attempt folding if there is only one use of the flag and no use of
// the value.
18299 if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
18302 SDValue LHS = Cmp.getOperand(0);
18303 SDValue RHS = Cmp.getOperand(1);
18305 assert(LHS.getValueType() == RHS.getValueType() &&
18306 "Expected the value type to be the same for both operands!");
18307 if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
18310 if (isNullConstant(LHS))
18311 std::swap(LHS, RHS);
18313 if (!isNullConstant(RHS))
18316 if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
18317 LHS.getOpcode() == ISD::SRL)
18320 // Fold the compare into the branch instruction.
18322 if (CC == AArch64CC::EQ)
18323 BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
18325 BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
18327 // Do not add new nodes to DAG combiner worklist.
18328 DCI.CombineTo(N, BR, false);
18333 static SDValue foldCSELofCTTZ(SDNode *N, SelectionDAG &DAG) {
18334 unsigned CC = N->getConstantOperandVal(2);
18335 SDValue SUBS = N->getOperand(3);
18336 SDValue Zero, CTTZ;
18338 if (CC == AArch64CC::EQ && SUBS.getOpcode() == AArch64ISD::SUBS) {
18339 Zero = N->getOperand(0);
18340 CTTZ = N->getOperand(1);
18341 } else if (CC == AArch64CC::NE && SUBS.getOpcode() == AArch64ISD::SUBS) {
18342 Zero = N->getOperand(1);
18343 CTTZ = N->getOperand(0);
18347 if ((CTTZ.getOpcode() != ISD::CTTZ && CTTZ.getOpcode() != ISD::TRUNCATE) ||
18348 (CTTZ.getOpcode() == ISD::TRUNCATE &&
18349 CTTZ.getOperand(0).getOpcode() != ISD::CTTZ))
18352 assert((CTTZ.getValueType() == MVT::i32 || CTTZ.getValueType() == MVT::i64) &&
18353 "Illegal type in CTTZ folding");
18355 if (!isNullConstant(Zero) || !isNullConstant(SUBS.getOperand(1)))
18358 SDValue X = CTTZ.getOpcode() == ISD::TRUNCATE
18359 ? CTTZ.getOperand(0).getOperand(0)
18360 : CTTZ.getOperand(0);
18362 if (X != SUBS.getOperand(0))
18365 unsigned BitWidth = CTTZ.getOpcode() == ISD::TRUNCATE
18366 ? CTTZ.getOperand(0).getValueSizeInBits()
18367 : CTTZ.getValueSizeInBits();
18368 SDValue BitWidthMinusOne =
18369 DAG.getConstant(BitWidth - 1, SDLoc(N), CTTZ.getValueType());
18370 return DAG.getNode(ISD::AND, SDLoc(N), CTTZ.getValueType(), CTTZ,
18374 // Optimize CSEL instructions
18375 static SDValue performCSELCombine(SDNode *N,
18376 TargetLowering::DAGCombinerInfo &DCI,
18377 SelectionDAG &DAG) {
18378 // CSEL x, x, cc -> x
18379 if (N->getOperand(0) == N->getOperand(1))
18380 return N->getOperand(0);
18382 // CSEL 0, cttz(X), eq(X, 0) -> AND cttz bitwidth-1
18383 // CSEL cttz(X), 0, ne(X, 0) -> AND cttz bitwidth-1
18384 if (SDValue Folded = foldCSELofCTTZ(N, DAG))
18387 return performCONDCombine(N, DCI, DAG, 2, 3);
// Try to re-use an already extended operand of a vector SetCC feeding an
18391 // extended select. Doing so avoids requiring another full extension of the
18392 // SET_CC result when lowering the select.
18393 static SDValue tryToWidenSetCCOperands(SDNode *Op, SelectionDAG &DAG) {
18394 EVT Op0MVT = Op->getOperand(0).getValueType();
18395 if (!Op0MVT.isVector() || Op->use_empty())
// Make sure that all uses of Op are VSELECTs with matching result types, where
18399 // the result type has a larger element type than the SetCC operand.
18400 SDNode *FirstUse = *Op->use_begin();
18401 if (FirstUse->getOpcode() != ISD::VSELECT)
18403 EVT UseMVT = FirstUse->getValueType(0);
18404 if (UseMVT.getScalarSizeInBits() <= Op0MVT.getScalarSizeInBits())
18406 if (any_of(Op->uses(), [&UseMVT](const SDNode *N) {
18407 return N->getOpcode() != ISD::VSELECT || N->getValueType(0) != UseMVT;
18412 if (!ISD::isConstantSplatVector(Op->getOperand(1).getNode(), V))
18418 ISD::CondCode CC = cast<CondCodeSDNode>(Op->getOperand(2))->get();
18419 // Check if the first operand of the SET_CC is already extended. If it is,
18420 // split the SET_CC and re-use the extended version of the operand.
18421 SDNode *Op0SExt = DAG.getNodeIfExists(ISD::SIGN_EXTEND, DAG.getVTList(UseMVT),
18422 Op->getOperand(0));
18423 SDNode *Op0ZExt = DAG.getNodeIfExists(ISD::ZERO_EXTEND, DAG.getVTList(UseMVT),
18424 Op->getOperand(0));
18425 if (Op0SExt && (isSignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
18426 Op0ExtV = SDValue(Op0SExt, 0);
18427 Op1ExtV = DAG.getNode(ISD::SIGN_EXTEND, DL, UseMVT, Op->getOperand(1));
18428 } else if (Op0ZExt && (isUnsignedIntSetCC(CC) || isIntEqualitySetCC(CC))) {
18429 Op0ExtV = SDValue(Op0ZExt, 0);
18430 Op1ExtV = DAG.getNode(ISD::ZERO_EXTEND, DL, UseMVT, Op->getOperand(1));
18434 return DAG.getNode(ISD::SETCC, DL, UseMVT.changeVectorElementType(MVT::i1),
18435 Op0ExtV, Op1ExtV, Op->getOperand(2));
18438 static SDValue performSETCCCombine(SDNode *N,
18439 TargetLowering::DAGCombinerInfo &DCI,
18440 SelectionDAG &DAG) {
18441 assert(N->getOpcode() == ISD::SETCC && "Unexpected opcode!");
18442 SDValue LHS = N->getOperand(0);
18443 SDValue RHS = N->getOperand(1);
18444 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
18446 EVT VT = N->getValueType(0);
18448 if (SDValue V = tryToWidenSetCCOperands(N, DAG))
18451 // setcc (csel 0, 1, cond, X), 1, ne ==> csel 0, 1, !cond, X
18452 if (Cond == ISD::SETNE && isOneConstant(RHS) &&
18453 LHS->getOpcode() == AArch64ISD::CSEL &&
18454 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18455 LHS->hasOneUse()) {
18456 // Invert CSEL's condition.
18457 auto *OpCC = cast<ConstantSDNode>(LHS.getOperand(2));
18458 auto OldCond = static_cast<AArch64CC::CondCode>(OpCC->getZExtValue());
18459 auto NewCond = getInvertedCondCode(OldCond);
18461 // csel 0, 1, !cond, X
18463 DAG.getNode(AArch64ISD::CSEL, DL, LHS.getValueType(), LHS.getOperand(0),
18464 LHS.getOperand(1), DAG.getConstant(NewCond, DL, MVT::i32),
18465 LHS.getOperand(3));
18466 return DAG.getZExtOrTrunc(CSEL, DL, VT);
18469 // setcc (srl x, imm), 0, ne ==> setcc (and x, (-1 << imm)), 0, ne
18470 if (Cond == ISD::SETNE && isNullConstant(RHS) &&
18471 LHS->getOpcode() == ISD::SRL && isa<ConstantSDNode>(LHS->getOperand(1)) &&
18472 LHS->hasOneUse()) {
18473 EVT TstVT = LHS->getValueType(0);
18474 if (TstVT.isScalarInteger() && TstVT.getFixedSizeInBits() <= 64) {
// This pattern will be optimised better by emitComparison.
18476 uint64_t TstImm = -1ULL << LHS->getConstantOperandVal(1);
18477 SDValue TST = DAG.getNode(ISD::AND, DL, TstVT, LHS->getOperand(0),
18478 DAG.getConstant(TstImm, DL, TstVT));
18479 return DAG.getNode(ISD::SETCC, DL, VT, TST, RHS, N->getOperand(2));
18483 // setcc (iN (bitcast (vNi1 X))), 0, (eq|ne)
18484 // ==> setcc (iN (zext (i1 (vecreduce_or (vNi1 X))))), 0, (eq|ne)
18485 if (DCI.isBeforeLegalize() && VT.isScalarInteger() &&
18486 (Cond == ISD::SETEQ || Cond == ISD::SETNE) && isNullConstant(RHS) &&
18487 LHS->getOpcode() == ISD::BITCAST) {
18488 EVT ToVT = LHS->getValueType(0);
18489 EVT FromVT = LHS->getOperand(0).getValueType();
18490 if (FromVT.isFixedLengthVector() &&
18491 FromVT.getVectorElementType() == MVT::i1) {
18492 LHS = DAG.getNode(ISD::VECREDUCE_OR, DL, MVT::i1, LHS->getOperand(0));
18493 LHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ToVT, LHS);
18494 return DAG.getSetCC(DL, VT, LHS, RHS, Cond);
18501 // Replace a flag-setting operator (eg ANDS) with the generic version
18502 // (eg AND) if the flag is unused.
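// For example, (ANDS x, y) whose flag result has no uses becomes (AND x, y),
// and an identical pre-existing (AND x, y) node is combined into this node's
// value result instead of being computed twice.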
18503 static SDValue performFlagSettingCombine(SDNode *N,
18504 TargetLowering::DAGCombinerInfo &DCI,
18505 unsigned GenericOpcode) {
18507 SDValue LHS = N->getOperand(0);
18508 SDValue RHS = N->getOperand(1);
18509 EVT VT = N->getValueType(0);
18511 // If the flag result isn't used, convert back to a generic opcode.
18512 if (!N->hasAnyUseOfValue(1)) {
18513 SDValue Res = DCI.DAG.getNode(GenericOpcode, DL, VT, N->ops());
18514 return DCI.DAG.getMergeValues({Res, DCI.DAG.getConstant(0, DL, MVT::i32)},
18518 // Combine identical generic nodes into this node, re-using the result.
18519 if (SDNode *Generic = DCI.DAG.getNodeIfExists(
18520 GenericOpcode, DCI.DAG.getVTList(VT), {LHS, RHS}))
18521 DCI.CombineTo(Generic, SDValue(N, 0));
18526 static SDValue performSetCCPunpkCombine(SDNode *N, SelectionDAG &DAG) {
18527 // setcc_merge_zero pred
18528 // (sign_extend (extract_subvector (setcc_merge_zero ... pred ...))), 0, ne
18529 // => extract_subvector (inner setcc_merge_zero)
18530 SDValue Pred = N->getOperand(0);
18531 SDValue LHS = N->getOperand(1);
18532 SDValue RHS = N->getOperand(2);
18533 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
18535 if (Cond != ISD::SETNE || !isZerosVector(RHS.getNode()) ||
18536 LHS->getOpcode() != ISD::SIGN_EXTEND)
18539 SDValue Extract = LHS->getOperand(0);
18540 if (Extract->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
18541 Extract->getValueType(0) != N->getValueType(0) ||
18542 Extract->getConstantOperandVal(1) != 0)
18545 SDValue InnerSetCC = Extract->getOperand(0);
18546 if (InnerSetCC->getOpcode() != AArch64ISD::SETCC_MERGE_ZERO)
18549 // By this point we've effectively got
18550 // zero_inactive_lanes_and_trunc_i1(sext_i1(A)). If we can prove A's inactive
18551 // lanes are already zero then the trunc(sext()) sequence is redundant and we
18552 // can operate on A directly.
18553 SDValue InnerPred = InnerSetCC.getOperand(0);
18554 if (Pred.getOpcode() == AArch64ISD::PTRUE &&
18555 InnerPred.getOpcode() == AArch64ISD::PTRUE &&
18556 Pred.getConstantOperandVal(0) == InnerPred.getConstantOperandVal(0) &&
18557 Pred->getConstantOperandVal(0) >= AArch64SVEPredPattern::vl1 &&
18558 Pred->getConstantOperandVal(0) <= AArch64SVEPredPattern::vl256)
18565 performSetccMergeZeroCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
18566 assert(N->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
18567 "Unexpected opcode!");
18569 SelectionDAG &DAG = DCI.DAG;
18570 SDValue Pred = N->getOperand(0);
18571 SDValue LHS = N->getOperand(1);
18572 SDValue RHS = N->getOperand(2);
18573 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(3))->get();
18575 if (SDValue V = performSetCCPunpkCombine(N, DAG))
18578 if (Cond == ISD::SETNE && isZerosVector(RHS.getNode()) &&
18579 LHS->getOpcode() == ISD::SIGN_EXTEND &&
18580 LHS->getOperand(0)->getValueType(0) == N->getValueType(0)) {
18581 // setcc_merge_zero(
18582 // pred, extend(setcc_merge_zero(pred, ...)), != splat(0))
18583 // => setcc_merge_zero(pred, ...)
18584 if (LHS->getOperand(0)->getOpcode() == AArch64ISD::SETCC_MERGE_ZERO &&
18585 LHS->getOperand(0)->getOperand(0) == Pred)
18586 return LHS->getOperand(0);
18588 // setcc_merge_zero(
//    all_active, extend(nxvNi1 ...), != splat(0))
// -> nxvNi1 ...
18591 if (isAllActivePredicate(DAG, Pred))
18592 return LHS->getOperand(0);
18594 // setcc_merge_zero(
18595 // pred, extend(nxvNi1 ...), != splat(0))
18596 // -> nxvNi1 and(pred, ...)
18597 if (DCI.isAfterLegalizeDAG())
18598 // Do this after legalization to allow more folds on setcc_merge_zero
18599 // to be recognized.
18600 return DAG.getNode(ISD::AND, SDLoc(N), N->getValueType(0),
18601 LHS->getOperand(0), Pred);
18607 // Optimize some simple tbz/tbnz cases. Returns the new operand and bit to test
18608 // as well as whether the test should be inverted. This code is required to
18609 // catch these cases (as opposed to standard dag combines) because
18610 // AArch64ISD::TBZ is matched during legalization.
18611 static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
18612 SelectionDAG &DAG) {
18614 if (!Op->hasOneUse())
18617 // We don't handle undef/constant-fold cases below, as they should have
// already been taken care of (e.g. and of 0, test of undefined shifted bits,
// etc.).
18621 // (tbz (trunc x), b) -> (tbz x, b)
18622 // This case is just here to enable more of the below cases to be caught.
18623 if (Op->getOpcode() == ISD::TRUNCATE &&
18624 Bit < Op->getValueType(0).getSizeInBits()) {
18625 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18628 // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
18629 if (Op->getOpcode() == ISD::ANY_EXTEND &&
18630 Bit < Op->getOperand(0).getValueSizeInBits()) {
18631 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18634 if (Op->getNumOperands() != 2)
18637 auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
18641 switch (Op->getOpcode()) {
18645 // (tbz (and x, m), b) -> (tbz x, b)
18647 if ((C->getZExtValue() >> Bit) & 1)
18648 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18651 // (tbz (shl x, c), b) -> (tbz x, b-c)
18653 if (C->getZExtValue() <= Bit &&
18654 (Bit - C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
18655 Bit = Bit - C->getZExtValue();
18656 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18660 // (tbz (sra x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits in x
18662 Bit = Bit + C->getZExtValue();
18663 if (Bit >= Op->getValueType(0).getSizeInBits())
18664 Bit = Op->getValueType(0).getSizeInBits() - 1;
18665 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18667 // (tbz (srl x, c), b) -> (tbz x, b+c)
18669 if ((Bit + C->getZExtValue()) < Op->getValueType(0).getSizeInBits()) {
18670 Bit = Bit + C->getZExtValue();
18671 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18675 // (tbz (xor x, -1), b) -> (tbnz x, b)
18677 if ((C->getZExtValue() >> Bit) & 1)
18679 return getTestBitOperand(Op->getOperand(0), Bit, Invert, DAG);
18683 // Optimize test single bit zero/non-zero and branch.
18684 static SDValue performTBZCombine(SDNode *N,
18685 TargetLowering::DAGCombinerInfo &DCI,
18686 SelectionDAG &DAG) {
18687 unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
18688 bool Invert = false;
18689 SDValue TestSrc = N->getOperand(1);
18690 SDValue NewTestSrc = getTestBitOperand(TestSrc, Bit, Invert, DAG);
18692 if (TestSrc == NewTestSrc)
18695 unsigned NewOpc = N->getOpcode();
18697 if (NewOpc == AArch64ISD::TBZ)
18698 NewOpc = AArch64ISD::TBNZ;
18700 assert(NewOpc == AArch64ISD::TBNZ);
18701 NewOpc = AArch64ISD::TBZ;
18706 return DAG.getNode(NewOpc, DL, MVT::Other, N->getOperand(0), NewTestSrc,
18707 DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
// Swap vselect operands where it may allow the operation to be executed as a
// predicated (merging) instruction, i.e.
18713 // (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
18714 // => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
18715 static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
18716 auto SelectA = N->getOperand(1);
18717 auto SelectB = N->getOperand(2);
18718 auto NTy = N->getValueType(0);
18720 if (!NTy.isScalableVector())
18722 SDValue SetCC = N->getOperand(0);
18723 if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
18726 switch (SelectB.getOpcode()) {
18734 if (SelectA != SelectB.getOperand(0))
18737 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
18738 ISD::CondCode InverseCC =
18739 ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
18740 auto InverseSetCC =
18741 DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
18742 SetCC.getOperand(1), InverseCC);
18744 return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
18745 {InverseSetCC, SelectB, SelectA});
18748 // vselect (v1i1 setcc) ->
18749 // vselect (v1iXX setcc) (XX is the size of the compared operand type)
18750 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
18753 static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
18754 if (auto SwapResult = trySwapVSelectOperands(N, DAG))
18757 SDValue N0 = N->getOperand(0);
18758 EVT CCVT = N0.getValueType();
18760 if (isAllActivePredicate(DAG, N0))
18761 return N->getOperand(1);
18763 if (isAllInactivePredicate(N0))
18764 return N->getOperand(2);
18766 // Check for sign pattern (VSELECT setgt, iN lhs, -1, 1, -1) and transform
// into (OR (ASR lhs, N-1), 1), which requires fewer instructions for the
18768 // supported types.
18769 SDValue SetCC = N->getOperand(0);
18770 if (SetCC.getOpcode() == ISD::SETCC &&
18771 SetCC.getOperand(2) == DAG.getCondCode(ISD::SETGT)) {
18772 SDValue CmpLHS = SetCC.getOperand(0);
18773 EVT VT = CmpLHS.getValueType();
18774 SDNode *CmpRHS = SetCC.getOperand(1).getNode();
18775 SDNode *SplatLHS = N->getOperand(1).getNode();
18776 SDNode *SplatRHS = N->getOperand(2).getNode();
18778 if (CmpLHS.getValueType() == N->getOperand(1).getValueType() &&
18781 makeArrayRef({MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
18782 MVT::v2i32, MVT::v4i32, MVT::v2i64}),
18783 VT.getSimpleVT().SimpleTy) &&
18784 ISD::isConstantSplatVector(SplatLHS, SplatLHSVal) &&
18785 SplatLHSVal.isOne() && ISD::isConstantSplatVectorAllOnes(CmpRHS) &&
18786 ISD::isConstantSplatVectorAllOnes(SplatRHS)) {
18787 unsigned NumElts = VT.getVectorNumElements();
18788 SmallVector<SDValue, 8> Ops(
18789 NumElts, DAG.getConstant(VT.getScalarSizeInBits() - 1, SDLoc(N),
18790 VT.getScalarType()));
18791 SDValue Val = DAG.getBuildVector(VT, SDLoc(N), Ops);
18793 auto Shift = DAG.getNode(ISD::SRA, SDLoc(N), VT, CmpLHS, Val);
18794 auto Or = DAG.getNode(ISD::OR, SDLoc(N), VT, Shift, N->getOperand(1));
18799 if (N0.getOpcode() != ISD::SETCC ||
18800 CCVT.getVectorElementCount() != ElementCount::getFixed(1) ||
18801 CCVT.getVectorElementType() != MVT::i1)
18804 EVT ResVT = N->getValueType(0);
18805 EVT CmpVT = N0.getOperand(0).getValueType();
// Only combine when the result type is of the same size as the compared
// operands.
18808 if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
18811 SDValue IfTrue = N->getOperand(1);
18812 SDValue IfFalse = N->getOperand(2);
18813 SetCC = DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
18814 N0.getOperand(0), N0.getOperand(1),
18815 cast<CondCodeSDNode>(N0.getOperand(2))->get());
18816 return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC,
18820 /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
18821 /// the compare-mask instructions rather than going via NZCV, even if LHS and
18822 /// RHS are really scalar. This replaces any scalar setcc in the above pattern
18823 /// with a vector one followed by a DUP shuffle on the result.
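/// For example (roughly): "select (setcc i32 a, b), v2f32 x, v2f32 y" becomes
/// a v2i32 lane-wise compare of the scalars placed in lane 0, a DUP of lane 0
/// across the mask, and a VSELECT of x and y on that mask.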
18824 static SDValue performSelectCombine(SDNode *N,
18825 TargetLowering::DAGCombinerInfo &DCI) {
18826 SelectionDAG &DAG = DCI.DAG;
18827 SDValue N0 = N->getOperand(0);
18828 EVT ResVT = N->getValueType(0);
18830 if (N0.getOpcode() != ISD::SETCC)
18833 if (ResVT.isScalableVector())
18836 // Make sure the SETCC result is either i1 (initial DAG), or i32, the lowered
18837 // scalar SetCCResultType. We also don't expect vectors, because we assume
18838 // that selects fed by vector SETCCs are canonicalized to VSELECT.
18839 assert((N0.getValueType() == MVT::i1 || N0.getValueType() == MVT::i32) &&
18840 "Scalar-SETCC feeding SELECT has unexpected result type!");
// If NumMaskElts == 0, the comparison is larger than the select result. The
18843 // largest real NEON comparison is 64-bits per lane, which means the result is
18844 // at most 32-bits and an illegal vector. Just bail out for now.
18845 EVT SrcVT = N0.getOperand(0).getValueType();
18847 // Don't try to do this optimization when the setcc itself has i1 operands.
18848 // There are no legal vectors of i1, so this would be pointless.
18849 if (SrcVT == MVT::i1)
18852 int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
18853 if (!ResVT.isVector() || NumMaskElts == 0)
18856 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
18857 EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
18859 // Also bail out if the vector CCVT isn't the same size as ResVT.
18860 // This can happen if the SETCC operand size doesn't divide the ResVT size
18861 // (e.g., f64 vs v3f32).
18862 if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
18865 // Make sure we didn't create illegal types, if we're not supposed to.
18866 assert(DCI.isBeforeLegalize() ||
18867 DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));
// First perform a vector comparison, where lane 0 is the one we're interested
// in.
18873 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
18875 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
18876 SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
18878 // Now duplicate the comparison mask we want across all other lanes.
18879 SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
18880 SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask);
18881 Mask = DAG.getNode(ISD::BITCAST, DL,
18882 ResVT.changeVectorElementTypeToInteger(), Mask);
18884 return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
18887 static SDValue performDUPCombine(SDNode *N,
18888 TargetLowering::DAGCombinerInfo &DCI) {
18889 EVT VT = N->getValueType(0);
18890 // If "v2i32 DUP(x)" and "v4i32 DUP(x)" both exist, use an extract from the
// 128-bit vector version.
18892 if (VT.is64BitVector() && DCI.isAfterLegalizeDAG()) {
18893 EVT LVT = VT.getDoubleNumVectorElementsVT(*DCI.DAG.getContext());
18894 if (SDNode *LN = DCI.DAG.getNodeIfExists(
18895 N->getOpcode(), DCI.DAG.getVTList(LVT), {N->getOperand(0)})) {
18897 return DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SDValue(LN, 0),
18898 DCI.DAG.getConstant(0, DL, MVT::i64));
18902 return performPostLD1Combine(N, DCI, false);
18905 /// Get rid of unnecessary NVCASTs (that don't change the type).
18906 static SDValue performNVCASTCombine(SDNode *N) {
18907 if (N->getValueType(0) == N->getOperand(0).getValueType())
18908 return N->getOperand(0);
18913 // If all users of the globaladdr are of the form (globaladdr + constant), find
18914 // the smallest constant, fold it into the globaladdr's offset and rewrite the
18915 // globaladdr as (globaladdr + constant) - constant.
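// For example (illustrative): if G is only used as (G + 8) and (G + 12), the
// global node is rewritten as ((globaladdr G, offset 8) - 8); the +8 user then
// folds to the globaladdr itself and the +12 user becomes (globaladdr + 4).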
18916 static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
18917 const AArch64Subtarget *Subtarget,
18918 const TargetMachine &TM) {
18919 auto *GN = cast<GlobalAddressSDNode>(N);
18920 if (Subtarget->ClassifyGlobalReference(GN->getGlobal(), TM) !=
18921 AArch64II::MO_NO_FLAG)
18924 uint64_t MinOffset = -1ull;
18925 for (SDNode *N : GN->uses()) {
18926 if (N->getOpcode() != ISD::ADD)
18928 auto *C = dyn_cast<ConstantSDNode>(N->getOperand(0));
18930 C = dyn_cast<ConstantSDNode>(N->getOperand(1));
18933 MinOffset = std::min(MinOffset, C->getZExtValue());
18935 uint64_t Offset = MinOffset + GN->getOffset();
18937 // Require that the new offset is larger than the existing one. Otherwise, we
18938 // can end up oscillating between two possible DAGs, for example,
18939 // (add (add globaladdr + 10, -1), 1) and (add globaladdr + 9, 1).
18940 if (Offset <= uint64_t(GN->getOffset()))
18943 // Check whether folding this offset is legal. It must not go out of bounds of
18944 // the referenced object to avoid violating the code model, and must be
18945 // smaller than 2^20 because this is the largest offset expressible in all
18946 // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
18947 // stores an immediate signed 21 bit offset.)
18949 // This check also prevents us from folding negative offsets, which will end
18950 // up being treated in the same way as large positive ones. They could also
18951 // cause code model violations, and aren't really common enough to matter.
18952 if (Offset >= (1 << 20))
18955 const GlobalValue *GV = GN->getGlobal();
18956 Type *T = GV->getValueType();
18957 if (!T->isSized() ||
18958 Offset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
18962 SDValue Result = DAG.getGlobalAddress(GV, DL, MVT::i64, Offset);
18963 return DAG.getNode(ISD::SUB, DL, MVT::i64, Result,
18964 DAG.getConstant(MinOffset, DL, MVT::i64));
// Turns the vector of indices into a vector of byte offsets by scaling Offset
18968 // by (BitWidth / 8).
18969 static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
18970 SDLoc DL, unsigned BitWidth) {
18971 assert(Offset.getValueType().isScalableVector() &&
18972 "This method is only for scalable vectors of offsets");
18974 SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
18975 SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
18977 return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
18980 /// Check if the value of \p OffsetInBytes can be used as an immediate for
18981 /// the gather load/prefetch and scatter store instructions with vector base and
18982 /// immediate offset addressing mode:
18984 /// [<Zn>.[S|D]{, #<imm>}]
18986 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
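/// For example, for 32-bit elements the valid immediates are 0, 4, 8, ..., 124.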
18987 inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
18988 unsigned ScalarSizeInBytes) {
18989 // The immediate is not a multiple of the scalar size.
18990 if (OffsetInBytes % ScalarSizeInBytes)
18993 // The immediate is out of range.
18994 if (OffsetInBytes / ScalarSizeInBytes > 31)
19000 /// Check if the value of \p Offset represents a valid immediate for the SVE
/// gather load/prefetch and scatter store instructions with vector base and
19002 /// immediate offset addressing mode:
19004 /// [<Zn>.[S|D]{, #<imm>}]
19006 /// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
19007 static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
19008 unsigned ScalarSizeInBytes) {
19009 ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
19010 return OffsetConst && isValidImmForSVEVecImmAddrMode(
19011 OffsetConst->getZExtValue(), ScalarSizeInBytes);
19014 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
19016 bool OnlyPackedOffsets = true) {
19017 const SDValue Src = N->getOperand(2);
19018 const EVT SrcVT = Src->getValueType(0);
19019 assert(SrcVT.isScalableVector() &&
19020 "Scatter stores are only possible for SVE vectors");
19023 MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
19025 // Make sure that source data will fit into an SVE register
19026 if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
19029 // For FPs, ACLE only supports _packed_ single and double precision types.
19030 if (SrcElVT.isFloatingPoint())
19031 if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
19034 // Depending on the addressing mode, this is either a pointer or a vector of
19035 // pointers (that fits into one register)
19036 SDValue Base = N->getOperand(4);
19037 // Depending on the addressing mode, this is either a single offset or a
19038 // vector of offsets (that fits into one register)
19039 SDValue Offset = N->getOperand(5);
19041 // For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal scatters because there's no instruction that takes
// indices.
19044 if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
19046 getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
19047 Opcode = AArch64ISD::SSTNT1_PRED;
// In the case of non-temporal scatter stores there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
//    * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
19053 // Since we do have intrinsics that allow the arguments to be in a different
19054 // order, we may need to swap them to match the spec.
19055 if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
19056 std::swap(Base, Offset);
19058 // SST1_IMM requires that the offset is an immediate that is:
19059 // * a multiple of #SizeInBytes,
19060 // * in the range [0, 31 x #SizeInBytes],
19061 // where #SizeInBytes is the size in bytes of the stored items. For
19062 // immediates outside that range and non-immediate scalar offsets use SST1 or
19063 // SST1_UXTW instead.
19064 if (Opcode == AArch64ISD::SST1_IMM_PRED) {
19065 if (!isValidImmForSVEVecImmAddrMode(Offset,
19066 SrcVT.getScalarSizeInBits() / 8)) {
19067 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
19068 Opcode = AArch64ISD::SST1_UXTW_PRED;
19070 Opcode = AArch64ISD::SST1_PRED;
19072 std::swap(Base, Offset);
19076 auto &TLI = DAG.getTargetLoweringInfo();
19077 if (!TLI.isTypeLegal(Base.getValueType()))
19080 // Some scatter store variants allow unpacked offsets, but only as nxv2i32
19081 // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to
19082 // nxv2i64. Legalize accordingly.
19083 if (!OnlyPackedOffsets &&
19084 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
19085 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
19087 if (!TLI.isTypeLegal(Offset.getValueType()))
19090 // Source value type that is representable in hardware
19091 EVT HwSrcVt = getSVEContainerType(SrcVT);
19093 // Keep the original type of the input data to store - this is needed to be
19094 // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
19095 // FP values we want the integer equivalent, so just use HwSrcVt.
19096 SDValue InputVT = DAG.getValueType(SrcVT);
19097 if (SrcVT.isFloatingPoint())
19098 InputVT = DAG.getValueType(HwSrcVt);
19100 SDVTList VTs = DAG.getVTList(MVT::Other);
19103 if (Src.getValueType().isFloatingPoint())
19104 SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
19106 SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
19108 SDValue Ops[] = {N->getOperand(0), // Chain
19110 N->getOperand(3), // Pg
19115 return DAG.getNode(Opcode, DL, VTs, Ops);
19118 static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
19120 bool OnlyPackedOffsets = true) {
19121 const EVT RetVT = N->getValueType(0);
19122 assert(RetVT.isScalableVector() &&
19123 "Gather loads are only possible for SVE vectors");
19127 // Make sure that the loaded data will fit into an SVE register
19128 if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
19131 // Depending on the addressing mode, this is either a pointer or a vector of
19132 // pointers (that fits into one register)
19133 SDValue Base = N->getOperand(3);
19134 // Depending on the addressing mode, this is either a single offset or a
19135 // vector of offsets (that fits into one register)
19136 SDValue Offset = N->getOperand(4);
19138 // For "scalar + vector of indices", just scale the indices. This only
19139 // applies to non-temporal gathers because there's no instruction that takes indices.
19141 if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
19142 Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
19143 RetVT.getScalarSizeInBits());
19144 Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
19147 // In the case of non-temporal gather loads there's only one SVE instruction
19148 // per data-size: "scalar + vector", i.e.
19149 // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
19150 // Since we do have intrinsics that allow the arguments to be in a different
19151 // order, we may need to swap them to match the spec.
19152 if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
19153 Offset.getValueType().isVector())
19154 std::swap(Base, Offset);
19156 // GLD{FF}1_IMM requires that the offset is an immediate that is:
19157 // * a multiple of #SizeInBytes,
19158 // * in the range [0, 31 x #SizeInBytes],
19159 // where #SizeInBytes is the size in bytes of the loaded items. For
19160 // immediates outside that range and non-immediate scalar offsets use
19161 // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
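// Illustrative example: with 64-bit elements #SizeInBytes is 8, so the legal
// immediates are 0, 8, ..., 248. A misaligned or out-of-range offset (say 12,
// or 256) takes the fallback below, which swaps the base and offset and picks
// the plain or UXTW form (or their first-faulting GLDFF1 counterparts).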
19162 if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
19163 Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
19164 if (!isValidImmForSVEVecImmAddrMode(Offset,
19165 RetVT.getScalarSizeInBits() / 8)) {
19166 if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
19167 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
19168 ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
19169 : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
19171 Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
19172 ? AArch64ISD::GLD1_MERGE_ZERO
19173 : AArch64ISD::GLDFF1_MERGE_ZERO;
19175 std::swap(Base, Offset);
19179 auto &TLI = DAG.getTargetLoweringInfo();
19180 if (!TLI.isTypeLegal(Base.getValueType()))
19183 // Some gather load variants allow unpacked offsets, but only as nxv2i32
19184 // vectors. These are implicitly sign (sxtw) or zero (uxtw) extended to
19185 // nxv2i64. Legalize accordingly.
19186 if (!OnlyPackedOffsets &&
19187 Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
19188 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
19190 // Return value type that is representable in hardware
19191 EVT HwRetVt = getSVEContainerType(RetVT);
19193 // Keep the original output value type around - this is needed to be able to
19194 // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
19195 // values we want the integer equivalent, so just use HwRetVT.
19196 SDValue OutVT = DAG.getValueType(RetVT);
19197 if (RetVT.isFloatingPoint())
19198 OutVT = DAG.getValueType(HwRetVt);
19200 SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
19201 SDValue Ops[] = {N->getOperand(0), // Chain
19202 N->getOperand(2), // Pg
19203 Base, Offset, OutVT};
19205 SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
19206 SDValue LoadChain = SDValue(Load.getNode(), 1);
19208 if (RetVT.isInteger() && (RetVT != HwRetVt))
19209 Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
19211 // If the original return value was FP, bitcast accordingly. Doing it here
19212 // means that we can avoid adding TableGen patterns for FPs.
19213 if (RetVT.isFloatingPoint())
19214 Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
19216 return DAG.getMergeValues({Load, LoadChain}, DL);
19220 performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
19221 SelectionDAG &DAG) {
19223 SDValue Src = N->getOperand(0);
19224 unsigned Opc = Src->getOpcode();
19226 // Sign extend of an unsigned unpack -> signed unpack
19227 if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
19229 unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
19230 : AArch64ISD::SUNPKLO;
19232 // Push the sign extend to the operand of the unpack
19233 // This is necessary where, for example, the operand of the unpack
19234 // is another unpack:
19235 // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
19237 // -> 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8))
19239 // -> 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
19240 SDValue ExtOp = Src->getOperand(0);
19241 auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
19242 EVT EltTy = VT.getVectorElementType();
19245 assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
19246 "Sign extending from an invalid type");
19248 EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());
19250 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
19251 ExtOp, DAG.getValueType(ExtVT));
19253 return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
19256 if (DCI.isBeforeLegalizeOps())
19259 if (!EnableCombineMGatherIntrinsics)
19262 // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
19263 // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
19265 unsigned MemVTOpNum = 4;
19267 case AArch64ISD::LD1_MERGE_ZERO:
19268 NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
19271 case AArch64ISD::LDNF1_MERGE_ZERO:
19272 NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
19275 case AArch64ISD::LDFF1_MERGE_ZERO:
19276 NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
19279 case AArch64ISD::GLD1_MERGE_ZERO:
19280 NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
19282 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19283 NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
19285 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19286 NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
19288 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19289 NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
19291 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19292 NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
19294 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19295 NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
19297 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19298 NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
19300 case AArch64ISD::GLDFF1_MERGE_ZERO:
19301 NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
19303 case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
19304 NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
19306 case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
19307 NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
19309 case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
19310 NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
19312 case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
19313 NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
19315 case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
19316 NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
19318 case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
19319 NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
19321 case AArch64ISD::GLDNT1_MERGE_ZERO:
19322 NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
19328 EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
19329 EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
19331 if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
19334 EVT DstVT = N->getValueType(0);
19335 SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
19337 SmallVector<SDValue, 5> Ops;
19338 for (unsigned I = 0; I < Src->getNumOperands(); ++I)
19339 Ops.push_back(Src->getOperand(I));
19341 SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
19342 DCI.CombineTo(N, ExtLoad);
19343 DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
19345 // Return N so it doesn't get rechecked
19346 return SDValue(N, 0);
19349 /// Legalize the gather prefetch (scalar + vector addressing mode) when the
19350 /// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
19351 /// != nxv2i32) do not need legalization.
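/// A minimal illustration: an nxv2i32 offset operand is ANY_EXTENDed to
/// nxv2i64 and substituted back into the node's operand list; nodes whose
/// offset is already nxv2i64 (or packed nxv4i32) are left untouched.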
19352 static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
19353 const unsigned OffsetPos = 4;
19354 SDValue Offset = N->getOperand(OffsetPos);
19356 // Not an unpacked vector, bail out.
19357 if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
19360 // Extend the unpacked offset vector to 64-bit lanes.
19362 Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
19363 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
19364 // Replace the offset operand with the 64-bit one.
19365 Ops[OffsetPos] = Offset;
19367 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
19370 /// Combines a node carrying the intrinsic
19371 /// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
19372 /// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
19373 /// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
19374 /// SVE gather prefetch instruction with vector plus immediate addressing mode.
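/// Illustrative example (values chosen here): for prfb the element size is
/// 1 byte, so immediates 0..31 are representable; a scalar offset of 40 is
/// not, and the node is remapped to aarch64_sve_prfb_gather_uxtw_index with
/// the base and offset operands swapped.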
19375 static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
19376 unsigned ScalarSizeInBytes) {
19377 const unsigned ImmPos = 4, OffsetPos = 3;
19378 // No need to combine the node if the immediate is valid...
19379 if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
19382 // ...otherwise swap the offset base with the offset...
19383 SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
19384 std::swap(Ops[ImmPos], Ops[OffsetPos]);
19385 // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
19386 // `aarch64_sve_prfb_gather_uxtw_index`.
19388 Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
19391 return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
19394 // Return true if the vector operation can guarantee that only the first lane of its
19395 // result contains data, with all bits in other lanes set to zero.
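// For example, AArch64ISD::UADDV_PRED leaves its scalar result in lane 0 and
// zeroes every other lane, so explicitly inserting that result into lane 0 of
// a zeroed vector (see removeRedundantInsertVectorElt below) adds nothing.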
19396 static bool isLanes1toNKnownZero(SDValue Op) {
19397 switch (Op.getOpcode()) {
19400 case AArch64ISD::ANDV_PRED:
19401 case AArch64ISD::EORV_PRED:
19402 case AArch64ISD::FADDA_PRED:
19403 case AArch64ISD::FADDV_PRED:
19404 case AArch64ISD::FMAXNMV_PRED:
19405 case AArch64ISD::FMAXV_PRED:
19406 case AArch64ISD::FMINNMV_PRED:
19407 case AArch64ISD::FMINV_PRED:
19408 case AArch64ISD::ORV_PRED:
19409 case AArch64ISD::SADDV_PRED:
19410 case AArch64ISD::SMAXV_PRED:
19411 case AArch64ISD::SMINV_PRED:
19412 case AArch64ISD::UADDV_PRED:
19413 case AArch64ISD::UMAXV_PRED:
19414 case AArch64ISD::UMINV_PRED:
19419 static SDValue removeRedundantInsertVectorElt(SDNode *N) {
19420 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT && "Unexpected node!");
19421 SDValue InsertVec = N->getOperand(0);
19422 SDValue InsertElt = N->getOperand(1);
19423 SDValue InsertIdx = N->getOperand(2);
19425 // We only care about inserts into the first element...
19426 if (!isNullConstant(InsertIdx))
19428 // ...of a zero'd vector...
19429 if (!ISD::isConstantSplatVectorAllZeros(InsertVec.getNode()))
19431 // ...where the inserted data was previously extracted...
19432 if (InsertElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
19435 SDValue ExtractVec = InsertElt.getOperand(0);
19436 SDValue ExtractIdx = InsertElt.getOperand(1);
19438 // ...from the first element of a vector.
19439 if (!isNullConstant(ExtractIdx))
19442 // If we get here we are effectively trying to zero lanes 1-N of a vector.
19444 // Ensure there's no type conversion going on.
19445 if (N->getValueType(0) != ExtractVec.getValueType())
19448 if (!isLanes1toNKnownZero(ExtractVec))
19451 // The explicit zeroing is redundant.
19456 performInsertVectorEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
19457 if (SDValue Res = removeRedundantInsertVectorElt(N))
19460 return performPostLD1Combine(N, DCI, true);
19463 static SDValue performSVESpliceCombine(SDNode *N, SelectionDAG &DAG) {
19464 EVT Ty = N->getValueType(0);
19465 if (Ty.isInteger())
19468 EVT IntTy = Ty.changeVectorElementTypeToInteger();
19469 EVT ExtIntTy = getPackedSVEVectorVT(IntTy.getVectorElementCount());
19470 if (ExtIntTy.getVectorElementType().getScalarSizeInBits() <
19471 IntTy.getVectorElementType().getScalarSizeInBits())
19475 SDValue LHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(0)),
19477 SDValue RHS = DAG.getAnyExtOrTrunc(DAG.getBitcast(IntTy, N->getOperand(1)),
19479 SDValue Idx = N->getOperand(2);
19480 SDValue Splice = DAG.getNode(ISD::VECTOR_SPLICE, DL, ExtIntTy, LHS, RHS, Idx);
19481 SDValue Trunc = DAG.getAnyExtOrTrunc(Splice, DL, IntTy);
19482 return DAG.getBitcast(Ty, Trunc);
19485 static SDValue performFPExtendCombine(SDNode *N, SelectionDAG &DAG,
19486 TargetLowering::DAGCombinerInfo &DCI,
19487 const AArch64Subtarget *Subtarget) {
19488 SDValue N0 = N->getOperand(0);
19489 EVT VT = N->getValueType(0);
19491 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
19492 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::FP_ROUND)
19495 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
19496 // We purposefully don't care about legality of the nodes here as we know
19497 // they can be split down into something legal.
19498 if (DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(N0.getNode()) &&
19499 N0.hasOneUse() && Subtarget->useSVEForFixedLengthVectors() &&
19500 VT.isFixedLengthVector() &&
19501 VT.getFixedSizeInBits() >= Subtarget->getMinSVEVectorSizeInBits()) {
19502 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
19503 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
19504 LN0->getChain(), LN0->getBasePtr(),
19505 N0.getValueType(), LN0->getMemOperand());
19506 DCI.CombineTo(N, ExtLoad);
19509 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
19510 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
19511 ExtLoad.getValue(1));
19512 return SDValue(N, 0); // Return N so it doesn't get rechecked!
19518 static SDValue performBSPExpandForSVE(SDNode *N, SelectionDAG &DAG,
19519 const AArch64Subtarget *Subtarget,
19520 bool fixedSVEVectorVT) {
19521 EVT VT = N->getValueType(0);
19523 // Don't expand for SVE2 or SME
19524 if (!VT.isScalableVector() || Subtarget->hasSVE2() || Subtarget->hasSME())
19527 // Don't expand for NEON
19528 if (VT.isFixedLengthVector() && !fixedSVEVectorVT)
19533 SDValue Mask = N->getOperand(0);
19534 SDValue In1 = N->getOperand(1);
19535 SDValue In2 = N->getOperand(2);
19537 SDValue InvMask = DAG.getNOT(DL, Mask, VT);
19538 SDValue Sel = DAG.getNode(ISD::AND, DL, VT, Mask, In1);
19539 SDValue SelInv = DAG.getNode(ISD::AND, DL, VT, InvMask, In2);
19540 return DAG.getNode(ISD::OR, DL, VT, Sel, SelInv);
19543 static SDValue performDupLane128Combine(SDNode *N, SelectionDAG &DAG) {
19544 EVT VT = N->getValueType(0);
19546 SDValue Insert = N->getOperand(0);
19547 if (Insert.getOpcode() != ISD::INSERT_SUBVECTOR)
19550 if (!Insert.getOperand(0).isUndef())
19553 uint64_t IdxInsert = Insert.getConstantOperandVal(2);
19554 uint64_t IdxDupLane = N->getConstantOperandVal(1);
19555 if (IdxInsert != 0 || IdxDupLane != 0)
19558 SDValue Bitcast = Insert.getOperand(1);
19559 if (Bitcast.getOpcode() != ISD::BITCAST)
19562 SDValue Subvec = Bitcast.getOperand(0);
19563 EVT SubvecVT = Subvec.getValueType();
19564 if (!SubvecVT.is128BitVector())
19567 getPackedSVEVectorVT(Subvec.getValueType().getVectorElementType());
19570 SDValue NewInsert =
19571 DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewSubvecVT,
19572 DAG.getUNDEF(NewSubvecVT), Subvec, Insert->getOperand(2));
19573 SDValue NewDuplane128 = DAG.getNode(AArch64ISD::DUPLANE128, DL, NewSubvecVT,
19574 NewInsert, N->getOperand(1));
19575 return DAG.getNode(ISD::BITCAST, DL, VT, NewDuplane128);
19578 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
19579 DAGCombinerInfo &DCI) const {
19580 SelectionDAG &DAG = DCI.DAG;
19581 switch (N->getOpcode()) {
19583 LLVM_DEBUG(dbgs() << "Custom combining: skipping\n");
19587 return performAddSubCombine(N, DCI, DAG);
19588 case ISD::BUILD_VECTOR:
19589 return performBuildVectorCombine(N, DCI, DAG);
19590 case AArch64ISD::ANDS:
19591 return performFlagSettingCombine(N, DCI, ISD::AND);
19592 case AArch64ISD::ADC:
19593 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
19595 return foldADCToCINC(N, DAG);
19596 case AArch64ISD::SBC:
19597 return foldOverflowCheck(N, DAG, /* IsAdd */ false);
19598 case AArch64ISD::ADCS:
19599 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ true))
19601 return performFlagSettingCombine(N, DCI, AArch64ISD::ADC);
19602 case AArch64ISD::SBCS:
19603 if (auto R = foldOverflowCheck(N, DAG, /* IsAdd */ false))
19605 return performFlagSettingCombine(N, DCI, AArch64ISD::SBC);
19607 return performXorCombine(N, DAG, DCI, Subtarget);
19609 return performMulCombine(N, DAG, DCI, Subtarget);
19610 case ISD::SINT_TO_FP:
19611 case ISD::UINT_TO_FP:
19612 return performIntToFpCombine(N, DAG, Subtarget);
19613 case ISD::FP_TO_SINT:
19614 case ISD::FP_TO_UINT:
19615 case ISD::FP_TO_SINT_SAT:
19616 case ISD::FP_TO_UINT_SAT:
19617 return performFpToIntCombine(N, DAG, DCI, Subtarget);
19619 return performFDivCombine(N, DAG, DCI, Subtarget);
19621 return performORCombine(N, DCI, Subtarget);
19623 return performANDCombine(N, DCI);
19624 case ISD::INTRINSIC_WO_CHAIN:
19625 return performIntrinsicCombine(N, DCI, Subtarget);
19626 case ISD::ANY_EXTEND:
19627 case ISD::ZERO_EXTEND:
19628 case ISD::SIGN_EXTEND:
19629 return performExtendCombine(N, DCI, DAG);
19630 case ISD::SIGN_EXTEND_INREG:
19631 return performSignExtendInRegCombine(N, DCI, DAG);
19632 case ISD::CONCAT_VECTORS:
19633 return performConcatVectorsCombine(N, DCI, DAG);
19634 case ISD::EXTRACT_SUBVECTOR:
19635 return performExtractSubvectorCombine(N, DCI, DAG);
19636 case ISD::INSERT_SUBVECTOR:
19637 return performInsertSubvectorCombine(N, DCI, DAG);
19639 return performSelectCombine(N, DCI);
19641 return performVSelectCombine(N, DCI.DAG);
19643 return performSETCCCombine(N, DCI, DAG);
19645 if (performTBISimplification(N->getOperand(1), DCI, DAG))
19646 return SDValue(N, 0);
19649 return performSTORECombine(N, DCI, DAG, Subtarget);
19651 return performMSTORECombine(N, DCI, DAG, Subtarget);
19653 case ISD::MSCATTER:
19654 return performMaskedGatherScatterCombine(N, DCI, DAG);
19655 case ISD::VECTOR_SPLICE:
19656 return performSVESpliceCombine(N, DAG);
19657 case ISD::FP_EXTEND:
19658 return performFPExtendCombine(N, DAG, DCI, Subtarget);
19659 case AArch64ISD::BRCOND:
19660 return performBRCONDCombine(N, DCI, DAG);
19661 case AArch64ISD::TBNZ:
19662 case AArch64ISD::TBZ:
19663 return performTBZCombine(N, DCI, DAG);
19664 case AArch64ISD::CSEL:
19665 return performCSELCombine(N, DCI, DAG);
19666 case AArch64ISD::DUP:
19667 return performDUPCombine(N, DCI);
19668 case AArch64ISD::DUPLANE128:
19669 return performDupLane128Combine(N, DAG);
19670 case AArch64ISD::NVCAST:
19671 return performNVCASTCombine(N);
19672 case AArch64ISD::SPLICE:
19673 return performSpliceCombine(N, DAG);
19674 case AArch64ISD::UUNPKLO:
19675 case AArch64ISD::UUNPKHI:
19676 return performUnpackCombine(N, DAG, Subtarget);
19677 case AArch64ISD::UZP1:
19678 return performUzpCombine(N, DAG);
19679 case AArch64ISD::SETCC_MERGE_ZERO:
19680 return performSetccMergeZeroCombine(N, DCI);
19681 case AArch64ISD::GLD1_MERGE_ZERO:
19682 case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
19683 case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
19684 case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
19685 case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
19686 case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
19687 case AArch64ISD::GLD1_IMM_MERGE_ZERO:
19688 case AArch64ISD::GLD1S_MERGE_ZERO:
19689 case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
19690 case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
19691 case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
19692 case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
19693 case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
19694 case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
19695 return performGLD1Combine(N, DAG);
19696 case AArch64ISD::VASHR:
19697 case AArch64ISD::VLSHR:
19698 return performVectorShiftCombine(N, *this, DCI);
19699 case AArch64ISD::SUNPKLO:
19700 return performSunpkloCombine(N, DAG);
19701 case AArch64ISD::BSP:
19702 return performBSPExpandForSVE(
19703 N, DAG, Subtarget, useSVEForFixedLengthVectorVT(N->getValueType(0)));
19704 case ISD::INSERT_VECTOR_ELT:
19705 return performInsertVectorEltCombine(N, DCI);
19706 case ISD::EXTRACT_VECTOR_ELT:
19707 return performExtractVectorEltCombine(N, DCI, Subtarget);
19708 case ISD::VECREDUCE_ADD:
19709 return performVecReduceAddCombine(N, DCI.DAG, Subtarget);
19710 case AArch64ISD::UADDV:
19711 return performUADDVCombine(N, DAG);
19712 case AArch64ISD::SMULL:
19713 case AArch64ISD::UMULL:
19714 return tryCombineLongOpWithDup(Intrinsic::not_intrinsic, N, DCI, DAG);
19715 case ISD::INTRINSIC_VOID:
19716 case ISD::INTRINSIC_W_CHAIN:
19717 switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
19718 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
19719 return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
19720 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
19721 return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
19722 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
19723 return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
19724 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
19725 return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
19726 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
19727 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
19728 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
19729 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
19730 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
19731 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
19732 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
19733 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
19734 return legalizeSVEGatherPrefetchOffsVec(N, DAG);
19735 case Intrinsic::aarch64_neon_ld2:
19736 case Intrinsic::aarch64_neon_ld3:
19737 case Intrinsic::aarch64_neon_ld4:
19738 case Intrinsic::aarch64_neon_ld1x2:
19739 case Intrinsic::aarch64_neon_ld1x3:
19740 case Intrinsic::aarch64_neon_ld1x4:
19741 case Intrinsic::aarch64_neon_ld2lane:
19742 case Intrinsic::aarch64_neon_ld3lane:
19743 case Intrinsic::aarch64_neon_ld4lane:
19744 case Intrinsic::aarch64_neon_ld2r:
19745 case Intrinsic::aarch64_neon_ld3r:
19746 case Intrinsic::aarch64_neon_ld4r:
19747 case Intrinsic::aarch64_neon_st2:
19748 case Intrinsic::aarch64_neon_st3:
19749 case Intrinsic::aarch64_neon_st4:
19750 case Intrinsic::aarch64_neon_st1x2:
19751 case Intrinsic::aarch64_neon_st1x3:
19752 case Intrinsic::aarch64_neon_st1x4:
19753 case Intrinsic::aarch64_neon_st2lane:
19754 case Intrinsic::aarch64_neon_st3lane:
19755 case Intrinsic::aarch64_neon_st4lane:
19756 return performNEONPostLDSTCombine(N, DCI, DAG);
19757 case Intrinsic::aarch64_sve_ldnt1:
19758 return performLDNT1Combine(N, DAG);
19759 case Intrinsic::aarch64_sve_ld1rq:
19760 return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
19761 case Intrinsic::aarch64_sve_ld1ro:
19762 return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
19763 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
19764 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
19765 case Intrinsic::aarch64_sve_ldnt1_gather:
19766 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
19767 case Intrinsic::aarch64_sve_ldnt1_gather_index:
19768 return performGatherLoadCombine(N, DAG,
19769 AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
19770 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
19771 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
19772 case Intrinsic::aarch64_sve_ld1:
19773 return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
19774 case Intrinsic::aarch64_sve_ldnf1:
19775 return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
19776 case Intrinsic::aarch64_sve_ldff1:
19777 return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
19778 case Intrinsic::aarch64_sve_st1:
19779 return performST1Combine(N, DAG);
19780 case Intrinsic::aarch64_sve_stnt1:
19781 return performSTNT1Combine(N, DAG);
19782 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
19783 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
19784 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
19785 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
19786 case Intrinsic::aarch64_sve_stnt1_scatter:
19787 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
19788 case Intrinsic::aarch64_sve_stnt1_scatter_index:
19789 return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
19790 case Intrinsic::aarch64_sve_ld1_gather:
19791 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
19792 case Intrinsic::aarch64_sve_ld1_gather_index:
19793 return performGatherLoadCombine(N, DAG,
19794 AArch64ISD::GLD1_SCALED_MERGE_ZERO);
19795 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
19796 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
19797 /*OnlyPackedOffsets=*/false);
19798 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
19799 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
19800 /*OnlyPackedOffsets=*/false);
19801 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
19802 return performGatherLoadCombine(N, DAG,
19803 AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
19804 /*OnlyPackedOffsets=*/false);
19805 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
19806 return performGatherLoadCombine(N, DAG,
19807 AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
19808 /*OnlyPackedOffsets=*/false);
19809 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
19810 return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
19811 case Intrinsic::aarch64_sve_ldff1_gather:
19812 return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
19813 case Intrinsic::aarch64_sve_ldff1_gather_index:
19814 return performGatherLoadCombine(N, DAG,
19815 AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
19816 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
19817 return performGatherLoadCombine(N, DAG,
19818 AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
19819 /*OnlyPackedOffsets=*/false);
19820 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
19821 return performGatherLoadCombine(N, DAG,
19822 AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
19823 /*OnlyPackedOffsets=*/false);
19824 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
19825 return performGatherLoadCombine(N, DAG,
19826 AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
19827 /*OnlyPackedOffsets=*/false);
19828 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
19829 return performGatherLoadCombine(N, DAG,
19830 AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
19831 /*OnlyPackedOffsets=*/false);
19832 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
19833 return performGatherLoadCombine(N, DAG,
19834 AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
19835 case Intrinsic::aarch64_sve_st1_scatter:
19836 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
19837 case Intrinsic::aarch64_sve_st1_scatter_index:
19838 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
19839 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
19840 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
19841 /*OnlyPackedOffsets=*/false);
19842 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
19843 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
19844 /*OnlyPackedOffsets=*/false);
19845 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
19846 return performScatterStoreCombine(N, DAG,
19847 AArch64ISD::SST1_SXTW_SCALED_PRED,
19848 /*OnlyPackedOffsets=*/false);
19849 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
19850 return performScatterStoreCombine(N, DAG,
19851 AArch64ISD::SST1_UXTW_SCALED_PRED,
19852 /*OnlyPackedOffsets=*/false);
19853 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
19854 return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
19855 case Intrinsic::aarch64_sve_tuple_get: {
19857 SDValue Chain = N->getOperand(0);
19858 SDValue Src1 = N->getOperand(2);
19859 SDValue Idx = N->getOperand(3);
19861 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
19862 EVT ResVT = N->getValueType(0);
19863 uint64_t NumLanes = ResVT.getVectorElementCount().getKnownMinValue();
19864 SDValue ExtIdx = DAG.getVectorIdxConstant(IdxConst * NumLanes, DL);
19866 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1, ExtIdx);
19867 return DAG.getMergeValues({Val, Chain}, DL);
19869 case Intrinsic::aarch64_sve_tuple_set: {
19871 SDValue Chain = N->getOperand(0);
19872 SDValue Tuple = N->getOperand(2);
19873 SDValue Idx = N->getOperand(3);
19874 SDValue Vec = N->getOperand(4);
19876 EVT TupleVT = Tuple.getValueType();
19877 uint64_t TupleLanes = TupleVT.getVectorElementCount().getKnownMinValue();
19879 uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
19880 uint64_t NumLanes =
19881 Vec.getValueType().getVectorElementCount().getKnownMinValue();
19883 if ((TupleLanes % NumLanes) != 0)
19884 report_fatal_error("invalid tuple vector!");
19886 uint64_t NumVecs = TupleLanes / NumLanes;
19888 SmallVector<SDValue, 4> Opnds;
19889 for (unsigned I = 0; I < NumVecs; ++I) {
19891 Opnds.push_back(Vec);
19893 SDValue ExtIdx = DAG.getVectorIdxConstant(I * NumLanes, DL);
19894 Opnds.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
19895 Vec.getValueType(), Tuple, ExtIdx));
19899 DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
19900 return DAG.getMergeValues({Concat, Chain}, DL);
19902 case Intrinsic::aarch64_sve_tuple_create2:
19903 case Intrinsic::aarch64_sve_tuple_create3:
19904 case Intrinsic::aarch64_sve_tuple_create4: {
19906 SDValue Chain = N->getOperand(0);
19908 SmallVector<SDValue, 4> Opnds;
19909 for (unsigned I = 2; I < N->getNumOperands(); ++I)
19910 Opnds.push_back(N->getOperand(I));
19912 EVT VT = Opnds[0].getValueType();
19913 EVT EltVT = VT.getVectorElementType();
19914 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
19915 VT.getVectorElementCount() *
19916 (N->getNumOperands() - 2));
19917 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
19918 return DAG.getMergeValues({Concat, Chain}, DL);
19920 case Intrinsic::aarch64_sve_ld2:
19921 case Intrinsic::aarch64_sve_ld3:
19922 case Intrinsic::aarch64_sve_ld4: {
19924 SDValue Chain = N->getOperand(0);
19925 SDValue Mask = N->getOperand(2);
19926 SDValue BasePtr = N->getOperand(3);
19927 SDValue LoadOps[] = {Chain, Mask, BasePtr};
19928 unsigned IntrinsicID =
19929 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19931 LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
19932 return DAG.getMergeValues({Result, Chain}, DL);
19934 case Intrinsic::aarch64_rndr:
19935 case Intrinsic::aarch64_rndrrs: {
19936 unsigned IntrinsicID =
19937 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
19939 (IntrinsicID == Intrinsic::aarch64_rndr ? AArch64SysReg::RNDR
19940 : AArch64SysReg::RNDRRS);
19942 SDValue A = DAG.getNode(
19943 AArch64ISD::MRS, DL, DAG.getVTList(MVT::i64, MVT::Glue, MVT::Other),
19944 N->getOperand(0), DAG.getConstant(Register, DL, MVT::i64));
19945 SDValue B = DAG.getNode(
19946 AArch64ISD::CSINC, DL, MVT::i32, DAG.getConstant(0, DL, MVT::i32),
19947 DAG.getConstant(0, DL, MVT::i32),
19948 DAG.getConstant(AArch64CC::NE, DL, MVT::i32), A.getValue(1));
19949 return DAG.getMergeValues(
19950 {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL);
19956 case ISD::GlobalAddress:
19957 return performGlobalAddressCombine(N, DAG, Subtarget, getTargetMachine());
19962 // Check if the return value is used only as a return value, as otherwise
19963 // we can't perform a tail-call. In particular, we need to check for
19964 // target ISD nodes that are returns and any other "odd" constructs
19965 // that the generic analysis code won't necessarily catch.
19966 bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
19967 SDValue &Chain) const {
19968 if (N->getNumValues() != 1)
19970 if (!N->hasNUsesOfValue(1, 0))
19973 SDValue TCChain = Chain;
19974 SDNode *Copy = *N->use_begin();
19975 if (Copy->getOpcode() == ISD::CopyToReg) {
19976 // If the copy has a glue operand, we conservatively assume it isn't safe to
19977 // perform a tail call.
19978 if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
19981 TCChain = Copy->getOperand(0);
19982 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
19985 bool HasRet = false;
19986 for (SDNode *Node : Copy->uses()) {
19987 if (Node->getOpcode() != AArch64ISD::RET_FLAG)
19999 // Return whether an instruction can potentially be optimized to a tail
20000 // call. This will cause the optimizers to attempt to move, or duplicate,
20001 // return instructions to help enable tail call optimizations for this instruction.
20003 bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
20004 return CI->isTailCall();
20007 bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
20009 ISD::MemIndexedMode &AM,
20011 SelectionDAG &DAG) const {
20012 if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
20015 Base = Op->getOperand(0);
20016 // All of the indexed addressing mode instructions take a signed
20017 // 9 bit immediate offset.
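// That is, offsets in [-256, 255]. For example, an offset of 16 can form a
// writeback access such as "ldr x0, [x1, #16]!", whereas an offset of 256
// cannot and is left to the ordinary (non-indexed) addressing forms.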
20018 if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
20019 int64_t RHSC = RHS->getSExtValue();
20020 if (Op->getOpcode() == ISD::SUB)
20021 RHSC = -(uint64_t)RHSC;
20022 if (!isInt<9>(RHSC))
20024 IsInc = (Op->getOpcode() == ISD::ADD);
20025 Offset = Op->getOperand(1);
20031 bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
20033 ISD::MemIndexedMode &AM,
20034 SelectionDAG &DAG) const {
20037 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20038 VT = LD->getMemoryVT();
20039 Ptr = LD->getBasePtr();
20040 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20041 VT = ST->getMemoryVT();
20042 Ptr = ST->getBasePtr();
20047 if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
20049 AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
20053 bool AArch64TargetLowering::getPostIndexedAddressParts(
20054 SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
20055 ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
20058 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20059 VT = LD->getMemoryVT();
20060 Ptr = LD->getBasePtr();
20061 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20062 VT = ST->getMemoryVT();
20063 Ptr = ST->getBasePtr();
20068 if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
20070 // Post-indexing updates the base, so it's not a valid transform
20071 // if that's not the same as the load's pointer.
20074 AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
20078 void AArch64TargetLowering::ReplaceBITCASTResults(
20079 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
20081 SDValue Op = N->getOperand(0);
20082 EVT VT = N->getValueType(0);
20083 EVT SrcVT = Op.getValueType();
20085 if (VT.isScalableVector() && !isTypeLegal(VT) && isTypeLegal(SrcVT)) {
20086 assert(!VT.isFloatingPoint() && SrcVT.isFloatingPoint() &&
20087 "Expected fp->int bitcast!");
20089 // Bitcasting between unpacked vector types of different element counts is
20090 // not a NOP because the live elements are laid out differently.
20092 // e.g. nxv2i32 = XX??XX??
20093 // nxv4f16 = X?X?X?X?
20094 if (VT.getVectorElementCount() != SrcVT.getVectorElementCount())
20097 SDValue CastResult = getSVESafeBitCast(getSVEContainerType(VT), Op, DAG);
20098 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, CastResult));
20102 if (VT != MVT::i16 || (SrcVT != MVT::f16 && SrcVT != MVT::bf16))
20106 DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
20107 DAG.getUNDEF(MVT::i32), Op,
20108 DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
20110 Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
20111 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
20114 static void ReplaceAddWithADDP(SDNode *N, SmallVectorImpl<SDValue> &Results,
20116 const AArch64Subtarget *Subtarget) {
20117 EVT VT = N->getValueType(0);
20118 if (!VT.is256BitVector() ||
20119 (VT.getScalarType().isFloatingPoint() &&
20120 !N->getFlags().hasAllowReassociation()) ||
20121 (VT.getScalarType() == MVT::f16 && !Subtarget->hasFullFP16()))
20124 SDValue X = N->getOperand(0);
20125 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(1));
20127 Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0));
20128 X = N->getOperand(1);
20133 if (Shuf->getOperand(0) != X || !Shuf->getOperand(1)->isUndef())
20136 // Check the mask is 1,0,3,2,5,4,...
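// e.g. for v8i32 this matches add(X, shuffle(X, undef, <1,0,3,2,5,4,7,6>)),
// where every lane of the add already holds the sum of its even/odd pair;
// ADDP on the split halves computes each pair sum once, and the shuffle
// built afterwards duplicates it back into both lanes.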
20137 ArrayRef<int> Mask = Shuf->getMask();
20138 for (int I = 0, E = Mask.size(); I < E; I++)
20139 if (Mask[I] != (I % 2 == 0 ? I + 1 : I - 1))
20143 auto LoHi = DAG.SplitVector(X, DL);
20144 assert(LoHi.first.getValueType() == LoHi.second.getValueType());
20145 SDValue Addp = DAG.getNode(AArch64ISD::ADDP, N, LoHi.first.getValueType(),
20146 LoHi.first, LoHi.second);
20148 // Shuffle the elements back into order.
20149 SmallVector<int> NMask;
20150 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I < E; I++) {
20151 NMask.push_back(I);
20152 NMask.push_back(I);
20155 DAG.getVectorShuffle(VT, DL,
20156 DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Addp,
20157 DAG.getUNDEF(LoHi.first.getValueType())),
20158 DAG.getUNDEF(VT), NMask));
20161 static void ReplaceReductionResults(SDNode *N,
20162 SmallVectorImpl<SDValue> &Results,
20163 SelectionDAG &DAG, unsigned InterOp,
20164 unsigned AcrossOp) {
20168 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
20169 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
20170 SDValue InterVal = DAG.getNode(InterOp, dl, LoVT, Lo, Hi);
20171 SDValue SplitVal = DAG.getNode(AcrossOp, dl, LoVT, InterVal);
20172 Results.push_back(SplitVal);
20175 static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
20177 SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);
20178 SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64,
20179 DAG.getNode(ISD::SRL, DL, MVT::i128, N,
20180 DAG.getConstant(64, DL, MVT::i64)));
20181 return std::make_pair(Lo, Hi);
20184 void AArch64TargetLowering::ReplaceExtractSubVectorResults(
20185 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
20186 SDValue In = N->getOperand(0);
20187 EVT InVT = In.getValueType();
20189 // Common code will handle these just fine.
20190 if (!InVT.isScalableVector() || !InVT.isInteger())
20194 EVT VT = N->getValueType(0);
20196 // The following checks bail if this is not a halving operation.
20198 ElementCount ResEC = VT.getVectorElementCount();
20200 if (InVT.getVectorElementCount() != (ResEC * 2))
20203 auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
20207 unsigned Index = CIndex->getZExtValue();
20208 if ((Index != 0) && (Index != ResEC.getKnownMinValue()))
20211 unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
20212 EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
20214 SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
20215 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
20218 // Create an even/odd pair of X registers holding integer value V.
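// For example (illustrative), an i128 compare value is split into two i64
// halves and tied into an XSeqPairs register pair (such as X0/X1) so that the
// CASP family of instructions can consume it.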
20219 static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
20220 SDLoc dl(V.getNode());
20221 SDValue VLo = DAG.getAnyExtOrTrunc(V, dl, MVT::i64);
20222 SDValue VHi = DAG.getAnyExtOrTrunc(
20223 DAG.getNode(ISD::SRL, dl, MVT::i128, V, DAG.getConstant(64, dl, MVT::i64)),
20225 if (DAG.getDataLayout().isBigEndian())
20226 std::swap (VLo, VHi);
20228 DAG.getTargetConstant(AArch64::XSeqPairsClassRegClassID, dl, MVT::i32);
20229 SDValue SubReg0 = DAG.getTargetConstant(AArch64::sube64, dl, MVT::i32);
20230 SDValue SubReg1 = DAG.getTargetConstant(AArch64::subo64, dl, MVT::i32);
20231 const SDValue Ops[] = { RegClass, VLo, SubReg0, VHi, SubReg1 };
20233 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
20236 static void ReplaceCMP_SWAP_128Results(SDNode *N,
20237 SmallVectorImpl<SDValue> &Results,
20239 const AArch64Subtarget *Subtarget) {
20240 assert(N->getValueType(0) == MVT::i128 &&
20241 "AtomicCmpSwap on types less than 128 should be legal");
20243 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
20244 if (Subtarget->hasLSE() || Subtarget->outlineAtomics()) {
20245 // LSE has a 128-bit compare and swap (CASP), but i128 is not a legal type,
20246 // so lower it here, wrapped in REG_SEQUENCE and EXTRACT_SUBREG.
20248 createGPRPairNode(DAG, N->getOperand(2)), // Compare value
20249 createGPRPairNode(DAG, N->getOperand(3)), // Store value
20250 N->getOperand(1), // Ptr
20251 N->getOperand(0), // Chain in
20255 switch (MemOp->getMergedOrdering()) {
20256 case AtomicOrdering::Monotonic:
20257 Opcode = AArch64::CASPX;
20259 case AtomicOrdering::Acquire:
20260 Opcode = AArch64::CASPAX;
20262 case AtomicOrdering::Release:
20263 Opcode = AArch64::CASPLX;
20265 case AtomicOrdering::AcquireRelease:
20266 case AtomicOrdering::SequentiallyConsistent:
20267 Opcode = AArch64::CASPALX;
20270 llvm_unreachable("Unexpected ordering!");
20273 MachineSDNode *CmpSwap = DAG.getMachineNode(
20274 Opcode, SDLoc(N), DAG.getVTList(MVT::Untyped, MVT::Other), Ops);
20275 DAG.setNodeMemRefs(CmpSwap, {MemOp});
20277 unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
20278 if (DAG.getDataLayout().isBigEndian())
20279 std::swap(SubReg1, SubReg2);
20280 SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
20281 SDValue(CmpSwap, 0));
20282 SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
20283 SDValue(CmpSwap, 0));
20285 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
20286 Results.push_back(SDValue(CmpSwap, 1)); // Chain out
20291 switch (MemOp->getMergedOrdering()) {
20292 case AtomicOrdering::Monotonic:
20293 Opcode = AArch64::CMP_SWAP_128_MONOTONIC;
20295 case AtomicOrdering::Acquire:
20296 Opcode = AArch64::CMP_SWAP_128_ACQUIRE;
20298 case AtomicOrdering::Release:
20299 Opcode = AArch64::CMP_SWAP_128_RELEASE;
20301 case AtomicOrdering::AcquireRelease:
20302 case AtomicOrdering::SequentiallyConsistent:
20303 Opcode = AArch64::CMP_SWAP_128;
20306 llvm_unreachable("Unexpected ordering!");
20309 auto Desired = splitInt128(N->getOperand(2), DAG);
20310 auto New = splitInt128(N->getOperand(3), DAG);
20311 SDValue Ops[] = {N->getOperand(1), Desired.first, Desired.second,
20312 New.first, New.second, N->getOperand(0)};
20313 SDNode *CmpSwap = DAG.getMachineNode(
20314 Opcode, SDLoc(N), DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other),
20316 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
20318 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
20319 SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
20320 Results.push_back(SDValue(CmpSwap, 3));
20323 void AArch64TargetLowering::ReplaceNodeResults(
20324 SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
20325 switch (N->getOpcode()) {
20327 llvm_unreachable("Don't know how to custom expand this");
20329 ReplaceBITCASTResults(N, Results, DAG);
20331 case ISD::VECREDUCE_ADD:
20332 case ISD::VECREDUCE_SMAX:
20333 case ISD::VECREDUCE_SMIN:
20334 case ISD::VECREDUCE_UMAX:
20335 case ISD::VECREDUCE_UMIN:
20336 Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
20340 ReplaceAddWithADDP(N, Results, DAG, Subtarget);
20345 if (SDValue Result = LowerCTPOP_PARITY(SDValue(N, 0), DAG))
20346 Results.push_back(Result);
20348 case AArch64ISD::SADDV:
20349 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
20351 case AArch64ISD::UADDV:
20352 ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::UADDV);
20354 case AArch64ISD::SMINV:
20355 ReplaceReductionResults(N, Results, DAG, ISD::SMIN, AArch64ISD::SMINV);
20357 case AArch64ISD::UMINV:
20358 ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
20360 case AArch64ISD::SMAXV:
20361 ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
20363 case AArch64ISD::UMAXV:
20364 ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
20366 case ISD::FP_TO_UINT:
20367 case ISD::FP_TO_SINT:
20368 case ISD::STRICT_FP_TO_SINT:
20369 case ISD::STRICT_FP_TO_UINT:
20370 assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
20371 // Let normal code take care of it by not adding anything to Results.
20373 case ISD::ATOMIC_CMP_SWAP:
20374 ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
20376 case ISD::ATOMIC_LOAD:
20378 assert(SDValue(N, 0).getValueType() == MVT::i128 &&
20379 "unexpected load's value type");
20380 MemSDNode *LoadNode = cast<MemSDNode>(N);
20381 if ((!LoadNode->isVolatile() && !LoadNode->isAtomic()) ||
20382 LoadNode->getMemoryVT() != MVT::i128) {
20383 // Non-volatile, non-atomic loads are optimized later in AArch64's load/store optimizer.
20388 SDValue Result = DAG.getMemIntrinsicNode(
20389 AArch64ISD::LDP, SDLoc(N),
20390 DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
20391 {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
20392 LoadNode->getMemOperand());
20394 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
20395 Result.getValue(0), Result.getValue(1));
20396 Results.append({Pair, Result.getValue(2) /* Chain */});
20399 case ISD::EXTRACT_SUBVECTOR:
20400 ReplaceExtractSubVectorResults(N, Results, DAG);
20402 case ISD::INSERT_SUBVECTOR:
20403 case ISD::CONCAT_VECTORS:
20404 // Custom lowering has been requested for INSERT_SUBVECTOR and
20405 // CONCAT_VECTORS -- but delegate to common code for result type legalisation.
20408 case ISD::INTRINSIC_WO_CHAIN: {
20409 EVT VT = N->getValueType(0);
20410 assert((VT == MVT::i8 || VT == MVT::i16) &&
20411 "custom lowering for unexpected type");
20413 ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
20414 Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
20418 case Intrinsic::aarch64_sve_clasta_n: {
20420 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
20421 auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
20422 N->getOperand(1), Op2, N->getOperand(3));
20423 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
20426 case Intrinsic::aarch64_sve_clastb_n: {
20428 auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
20429 auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
20430 N->getOperand(1), Op2, N->getOperand(3));
20431 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
20434 case Intrinsic::aarch64_sve_lasta: {
20436 auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
20437 N->getOperand(1), N->getOperand(2));
20438 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
20441 case Intrinsic::aarch64_sve_lastb: {
20443 auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
20444 N->getOperand(1), N->getOperand(2));
20445 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
20453 bool AArch64TargetLowering::useLoadStackGuardNode() const {
20454 if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
20455 return TargetLowering::useLoadStackGuardNode();
20459 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
20460 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
20461 // reciprocal if there are three or more FDIVs.
20465 TargetLoweringBase::LegalizeTypeAction
20466 AArch64TargetLowering::getPreferredVectorAction(MVT VT) const {
20467 // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
20468 // v4i16, v2i32 instead of to promote.
20469 if (VT == MVT::v1i8 || VT == MVT::v1i16 || VT == MVT::v1i32 ||
20471 return TypeWidenVector;
20473 return TargetLoweringBase::getPreferredVectorAction(VT);
20476 // In v8.4a, ldp and stp instructions are guaranteed to be single-copy atomic
20477 // provided the address is 16-byte aligned.
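// For example (illustrative), a 16-byte-aligned "load atomic i128" or
// "store atomic i128" can then be selected as a single LDP/STP instead of
// being expanded to an LL/SC or compare-and-swap sequence.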
20478 bool AArch64TargetLowering::isOpSuitableForLDPSTP(const Instruction *I) const {
20479 if (!Subtarget->hasLSE2())
20482 if (auto LI = dyn_cast<LoadInst>(I))
20483 return LI->getType()->getPrimitiveSizeInBits() == 128 &&
20484 LI->getAlign() >= Align(16);
20486 if (auto SI = dyn_cast<StoreInst>(I))
20487 return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128 &&
20488 SI->getAlign() >= Align(16);
20493 bool AArch64TargetLowering::shouldInsertFencesForAtomic(
20494 const Instruction *I) const {
20495 return isOpSuitableForLDPSTP(I);
20498 // Loads and stores less than 128-bits are already atomic; ones above that
20499 // are doomed anyway, so defer to the default libcall and blame the OS when
20500 // things go wrong.
20501 TargetLoweringBase::AtomicExpansionKind
20502 AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
20503 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
20504 if (Size != 128 || isOpSuitableForLDPSTP(SI))
20505 return AtomicExpansionKind::None;
20506 return AtomicExpansionKind::Expand;
20509 // Loads and stores less than 128-bits are already atomic; ones above that
20510 // are doomed anyway, so defer to the default libcall and blame the OS when
20511 // things go wrong.
20512 TargetLowering::AtomicExpansionKind
20513 AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
20514 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
20516 if (Size != 128 || isOpSuitableForLDPSTP(LI))
20517 return AtomicExpansionKind::None;
20519 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20520 // implement atomicrmw without spilling. If the target address is also on the
20521 // stack and close enough to the spill slot, this can lead to a situation
20522 // where the monitor always gets cleared and the atomic operation can never
20523 // succeed. So at -O0 lower this operation to a CAS loop.
20524 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
20525 return AtomicExpansionKind::CmpXChg;
20527 return AtomicExpansionKind::LLSC;
20530 // For the real atomic operations, we have ldxr/stxr up to 128 bits,
20531 TargetLowering::AtomicExpansionKind
20532 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
20533 if (AI->isFloatingPointOperation())
20534 return AtomicExpansionKind::CmpXChg;
20536 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
20537 if (Size > 128) return AtomicExpansionKind::None;
20539 // Nand is not supported in LSE.
20540 // Leave 128 bits to LLSC or CmpXChg.
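// For example, with LSE a 64-bit "atomicrmw add" needs no IR expansion (it
// selects to LDADD), whereas "atomicrmw nand" has no LSE encoding and falls
// through to the LL/SC (or, at -O0, CmpXChg) expansion chosen below.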
20541 if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
20542 if (Subtarget->hasLSE())
20543 return AtomicExpansionKind::None;
20544 if (Subtarget->outlineAtomics()) {
20545 // [U]Min/[U]Max RWM atomics are used in __sync_fetch_ libcalls so far.
20546 // Don't outline them unless
20547 // (1) high level <atomic> support approved:
20548 // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p0493r1.pdf
20549 // (2) low level libgcc and compiler-rt support implemented by:
20550 // min/max outline atomics helpers
20551 if (AI->getOperation() != AtomicRMWInst::Min &&
20552 AI->getOperation() != AtomicRMWInst::Max &&
20553 AI->getOperation() != AtomicRMWInst::UMin &&
20554 AI->getOperation() != AtomicRMWInst::UMax) {
20555 return AtomicExpansionKind::None;
20560 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20561 // implement atomicrmw without spilling. If the target address is also on the
20562 // stack and close enough to the spill slot, this can lead to a situation
20563 // where the monitor always gets cleared and the atomic operation can never
20564 // succeed. So at -O0 lower this operation to a CAS loop.
20565 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
20566 return AtomicExpansionKind::CmpXChg;
20568 return AtomicExpansionKind::LLSC;
20571 TargetLowering::AtomicExpansionKind
20572 AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
20573 AtomicCmpXchgInst *AI) const {
20574 // If subtarget has LSE, leave cmpxchg intact for codegen.
20575 if (Subtarget->hasLSE() || Subtarget->outlineAtomics())
20576 return AtomicExpansionKind::None;
20577 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
20578 // implement cmpxchg without spilling. If the address being exchanged is also
20579 // on the stack and close enough to the spill slot, this can lead to a
20580 // situation where the monitor always gets cleared and the atomic operation
20581 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
20582 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
20583 return AtomicExpansionKind::None;
20585 // 128-bit atomic cmpxchg is weird; AtomicExpand doesn't know how to expand it.
20587 unsigned Size = AI->getCompareOperand()->getType()->getPrimitiveSizeInBits();
20589 return AtomicExpansionKind::None;
20591 return AtomicExpansionKind::LLSC;
20594 Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
20595 Type *ValueTy, Value *Addr,
20596 AtomicOrdering Ord) const {
20597 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20598 bool IsAcquire = isAcquireOrStronger(Ord);
20600 // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp/ldaxp
20601 // intrinsic must return {i64, i64} and we have to recombine them into a
20602 // single i128 here.
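// Sketch of the resulting IR (assuming an acquire ordering):
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
// followed by zext/shl/or below to stitch the two halves into one i128.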
20603 if (ValueTy->getPrimitiveSizeInBits() == 128) {
20604 Intrinsic::ID Int =
20605 IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
20606 Function *Ldxr = Intrinsic::getDeclaration(M, Int);
20608 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20609 Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
20611 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
20612 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
20613 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
20614 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
20615 return Builder.CreateOr(
20616 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
20619 Type *Tys[] = { Addr->getType() };
20620 Intrinsic::ID Int =
20621 IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
20622 Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);
20624 const DataLayout &DL = M->getDataLayout();
20625 IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
20626 CallInst *CI = Builder.CreateCall(Ldxr, Addr);
20628 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
20629 Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
20631 return Builder.CreateBitCast(Trunc, ValueTy);
20634 void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
20635 IRBuilderBase &Builder) const {
20636 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20637 Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::aarch64_clrex));
20640 Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
20641 Value *Val, Value *Addr,
20642 AtomicOrdering Ord) const {
20643 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
20644 bool IsRelease = isReleaseOrStronger(Ord);
20646 // Since the intrinsics must have legal type, the i128 intrinsics take two
20647 // parameters: "i64, i64". We must marshal Val into the appropriate form
20648 // before the call.
20649 if (Val->getType()->getPrimitiveSizeInBits() == 128) {
20650 Intrinsic::ID Int =
20651 IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
20652 Function *Stxr = Intrinsic::getDeclaration(M, Int);
20653 Type *Int64Ty = Type::getInt64Ty(M->getContext());
20655 Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
20656 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
20657 Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
20658 return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
20661 Intrinsic::ID Int =
20662 IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
20663 Type *Tys[] = { Addr->getType() };
20664 Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
20666 const DataLayout &DL = M->getDataLayout();
20667 IntegerType *IntValTy = Builder.getIntNTy(DL.getTypeSizeInBits(Val->getType()));
20668 Val = Builder.CreateBitCast(Val, IntValTy);
20670 CallInst *CI = Builder.CreateCall(
20671 Stxr, {Builder.CreateZExtOrBitCast(
20672 Val, Stxr->getFunctionType()->getParamType(0)),
20674 CI->addParamAttr(1, Attribute::get(Builder.getContext(),
20675 Attribute::ElementType, Val->getType()));
20679 bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
20680 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
20681 const DataLayout &DL) const {
20682 if (!Ty->isArrayTy()) {
20683 const TypeSize &TySize = Ty->getPrimitiveSizeInBits();
20684 return TySize.isScalable() && TySize.getKnownMinSize() > 128;
20685 }
20687 // All non-aggregate members of the type must have the same type
20688 SmallVector<EVT> ValueVTs;
20689 ComputeValueVTs(*this, DL, Ty, ValueVTs);
20690 return is_splat(ValueVTs);
20691 }
20693 bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
20694 EVT) const {
20695 return false;
20696 }
20698 static Value *UseTlsOffset(IRBuilderBase &IRB, unsigned Offset) {
20699 Module *M = IRB.GetInsertBlock()->getParent()->getParent();
20700 Function *ThreadPointerFunc =
20701 Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
20702 return IRB.CreatePointerCast(
20703 IRB.CreateConstGEP1_32(IRB.getInt8Ty(), IRB.CreateCall(ThreadPointerFunc),
20704 Offset),
20705 IRB.getInt8PtrTy()->getPointerTo(0));
20706 }
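// Illustrative note on the helper above: it returns the address
// "thread-pointer + Offset" cast to i8**, i.e. a byte GEP off
// llvm.thread_pointer (TPIDR_EL0 on AArch64), so the callers below read a
// pointer-sized slot at a fixed TLS offset.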
20708 Value *AArch64TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
20709 // Android provides a fixed TLS slot for the stack cookie. See the definition
20710 // of TLS_SLOT_STACK_GUARD in
20711 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
20712 if (Subtarget->isTargetAndroid())
20713 return UseTlsOffset(IRB, 0x28);
20715 // Fuchsia is similar.
20716 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
20717 if (Subtarget->isTargetFuchsia())
20718 return UseTlsOffset(IRB, -0x10);
20720 return TargetLowering::getIRStackGuard(IRB);
20721 }
20723 void AArch64TargetLowering::insertSSPDeclarations(Module &M) const {
20724 // MSVC CRT provides functionalities for stack protection.
20725 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment()) {
20726 // MSVC CRT has a global variable holding security cookie.
20727 M.getOrInsertGlobal("__security_cookie",
20728 Type::getInt8PtrTy(M.getContext()));
20730 // MSVC CRT has a function to validate security cookie.
20731 FunctionCallee SecurityCheckCookie = M.getOrInsertFunction(
20732 "__security_check_cookie", Type::getVoidTy(M.getContext()),
20733 Type::getInt8PtrTy(M.getContext()));
20734 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) {
20735 F->setCallingConv(CallingConv::Win64);
20736 F->addParamAttr(0, Attribute::AttrKind::InReg);
20737 }
20738 return;
20739 }
20740 TargetLowering::insertSSPDeclarations(M);
20741 }
20743 Value *AArch64TargetLowering::getSDagStackGuard(const Module &M) const {
20744 // MSVC CRT has a global variable holding security cookie.
20745 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20746 return M.getGlobalVariable("__security_cookie");
20747 return TargetLowering::getSDagStackGuard(M);
20750 Function *AArch64TargetLowering::getSSPStackGuardCheck(const Module &M) const {
20751 // MSVC CRT has a function to validate security cookie.
20752 if (Subtarget->getTargetTriple().isWindowsMSVCEnvironment())
20753 return M.getFunction("__security_check_cookie");
20754 return TargetLowering::getSSPStackGuardCheck(M);
20755 }
20757 Value *
20758 AArch64TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const {
20759 // Android provides a fixed TLS slot for the SafeStack pointer. See the
20760 // definition of TLS_SLOT_SAFESTACK in
20761 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
20762 if (Subtarget->isTargetAndroid())
20763 return UseTlsOffset(IRB, 0x48);
20765 // Fuchsia is similar.
20766 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
20767 if (Subtarget->isTargetFuchsia())
20768 return UseTlsOffset(IRB, -0x8);
20770 return TargetLowering::getSafeStackPointerLocation(IRB);
20771 }
20773 bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
20774 const Instruction &AndI) const {
20775 // Only sink 'and' mask to cmp use block if it is masking a single bit, since
20776 // this is likely to fold the and/cmp/br into a single tbz instruction. It
20777 // may be beneficial to sink in other cases, but we would have to check that
20778 // the cmp would not get folded into the br to form a cbz for these to be
20779 // beneficial.
20780 ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
20781 if (!Mask)
20782 return false;
20783 return Mask->getValue().isPowerOf2();
20784 }
20786 bool AArch64TargetLowering::
20787 shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
20788 SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y,
20789 unsigned OldShiftOpcode, unsigned NewShiftOpcode,
20790 SelectionDAG &DAG) const {
20791 // Does baseline recommend not to perform the fold by default?
20792 if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
20793 X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG))
20794 return false;
20795 // Else, if this is a vector shift, prefer 'shl'.
20796 return X.getValueType().isScalarInteger() || NewShiftOpcode == ISD::SHL;
20797 }
20799 bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
20800 SDNode *N) const {
20801 if (DAG.getMachineFunction().getFunction().hasMinSize() &&
20802 !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
20803 return false;
20804 return true;
20805 }
20807 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
20808 // Update IsSplitCSR in AArch64FunctionInfo.
20809 AArch64FunctionInfo *AFI = Entry->getParent()->getInfo<AArch64FunctionInfo>();
20810 AFI->setIsSplitCSR(true);
20811 }
20813 void AArch64TargetLowering::insertCopiesSplitCSR(
20814 MachineBasicBlock *Entry,
20815 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
20816 const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
20817 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
20818 if (!IStart)
20819 return;
20821 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
20822 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
20823 MachineBasicBlock::iterator MBBI = Entry->begin();
20824 for (const MCPhysReg *I = IStart; *I; ++I) {
20825 const TargetRegisterClass *RC = nullptr;
20826 if (AArch64::GPR64RegClass.contains(*I))
20827 RC = &AArch64::GPR64RegClass;
20828 else if (AArch64::FPR64RegClass.contains(*I))
20829 RC = &AArch64::FPR64RegClass;
20831 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
20833 Register NewVR = MRI->createVirtualRegister(RC);
20834 // Create copy from CSR to a virtual register.
20835 // FIXME: this currently does not emit CFI pseudo-instructions, it works
20836 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
20837 // nounwind. If we want to generalize this later, we may need to emit
20838 // CFI pseudo-instructions.
20839 assert(Entry->getParent()->getFunction().hasFnAttribute(
20840 Attribute::NoUnwind) &&
20841 "Function should be nounwind in insertCopiesSplitCSR!");
20842 Entry->addLiveIn(*I);
20843 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
20844 .addReg(*I);
20846 // Insert the copy-back instructions right before the terminator.
20847 for (auto *Exit : Exits)
20848 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
20849 TII->get(TargetOpcode::COPY), *I)
20850 .addReg(NewVR);
20851 }
20852 }
20854 bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
20855 // Integer division on AArch64 is expensive. However, when aggressively
20856 // optimizing for code size, we prefer to use a div instruction, as it is
20857 // usually smaller than the alternative sequence.
20858 // The exception to this is vector division. Since AArch64 doesn't have vector
20859 // integer division, leaving the division as-is is a loss even in terms of
20860 // size, because it will have to be scalarized, while the alternative code
20861 // sequence can be performed in vector form.
20862 bool OptSize = Attr.hasFnAttr(Attribute::MinSize);
20863 return OptSize && !VT.isVector();
20864 }
20866 bool AArch64TargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
20867 // We want inc-of-add for scalars and sub-of-not for vectors.
20868 return VT.isScalarInteger();
20869 }
20871 bool AArch64TargetLowering::shouldConvertFpToSat(unsigned Op, EVT FPVT,
20872 EVT VT) const {
20873 // v8f16 without fp16 needs to be extended to v8f32, which is more difficult to
20874 // legalize.
20875 if (FPVT == MVT::v8f16 && !Subtarget->hasFullFP16())
20876 return false;
20877 return TargetLowering::shouldConvertFpToSat(Op, FPVT, VT);
20878 }
20880 bool AArch64TargetLowering::enableAggressiveFMAFusion(EVT VT) const {
20881 return Subtarget->hasAggressiveFMA() && VT.isFloatingPoint();
20882 }
20884 unsigned
20885 AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
20886 if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
20887 return getPointerTy(DL).getSizeInBits();
20889 return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
20890 }
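// Illustrative arithmetic for the AAPCS64 case above: va_list is
// { i8 *__stack, i8 *__gr_top, i8 *__vr_top, i32 __gr_offs, i32 __vr_offs },
// so with 64-bit pointers this returns 3 * 64 + 2 * 32 = 256 bits.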
20892 void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
20893 MachineFrameInfo &MFI = MF.getFrameInfo();
20894 // If we have any vulnerable SVE stack objects then the stack protector
20895 // needs to be placed at the top of the SVE stack area, as the SVE locals
20896 // are placed above the other locals, so we allocate it as if it were a
20897 // scalable vector.
20898 // FIXME: It may be worthwhile having a specific interface for this rather
20899 // than doing it here in finalizeLowering.
20900 if (MFI.hasStackProtectorIndex()) {
20901 for (unsigned int i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
20902 if (MFI.getStackID(i) == TargetStackID::ScalableVector &&
20903 MFI.getObjectSSPLayout(i) != MachineFrameInfo::SSPLK_None) {
20904 MFI.setStackID(MFI.getStackProtectorIndex(),
20905 TargetStackID::ScalableVector);
20906 MFI.setObjectAlignment(MFI.getStackProtectorIndex(), Align(16));
20907 break;
20908 }
20909 }
20910 }
20911 MFI.computeMaxCallFrameSize(MF);
20912 TargetLoweringBase::finalizeLowering(MF);
20913 }
20915 // Unlike X86, we let frame lowering assign offsets to all catch objects.
20916 bool AArch64TargetLowering::needsFixedCatchObjects() const {
20917 return false;
20918 }
20920 bool AArch64TargetLowering::shouldLocalize(
20921 const MachineInstr &MI, const TargetTransformInfo *TTI) const {
20922 auto &MF = *MI.getMF();
20923 auto &MRI = MF.getRegInfo();
20924 auto maxUses = [](unsigned RematCost) {
20925 // A cost of 1 means remats are basically free.
20926 if (RematCost == 1)
20927 return std::numeric_limits<unsigned>::max();
20928 if (RematCost == 2)
20929 return 2U;
20931 // Remat is too expensive, only sink if there's one user.
20932 if (RematCost > 2)
20933 return 1U;
20934 llvm_unreachable("Unexpected remat cost");
20935 };
20937 switch (MI.getOpcode()) {
20938 case TargetOpcode::G_GLOBAL_VALUE: {
20939 // On Darwin, TLS global vars get selected into function calls, which
20940 // we don't want localized, as they can get moved into the middle of
20941 // another call sequence.
20942 const GlobalValue &GV = *MI.getOperand(1).getGlobal();
20943 if (GV.isThreadLocal() && Subtarget->isTargetMachO())
20944 return false;
20945 break;
20946 }
20947 case TargetOpcode::G_CONSTANT: {
20948 auto *CI = MI.getOperand(1).getCImm();
20949 APInt Imm = CI->getValue();
20950 InstructionCost Cost = TTI->getIntImmCost(
20951 Imm, CI->getType(), TargetTransformInfo::TCK_CodeSize);
20952 assert(Cost.isValid() && "Expected a valid imm cost");
20954 unsigned RematCost = *Cost.getValue();
20955 Register Reg = MI.getOperand(0).getReg();
20956 unsigned MaxUses = maxUses(RematCost);
20957 // Don't pass UINT_MAX sentinel value to hasAtMostUserInstrs().
20958 if (MaxUses == std::numeric_limits<unsigned>::max())
20959 return true;
20960 return MRI.hasAtMostUserInstrs(Reg, MaxUses);
20961 }
20962 // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
20963 // rematerializable.
20964 case AArch64::ADRP:
20965 case AArch64::G_ADD_LOW:
20966 return true;
20967 default:
20968 break;
20969 }
20970 return TargetLoweringBase::shouldLocalize(MI, TTI);
20971 }
20973 bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
20974 if (isa<ScalableVectorType>(Inst.getType()))
20975 return true;
20977 for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
20978 if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
20979 return true;
20981 if (const AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
20982 if (isa<ScalableVectorType>(AI->getAllocatedType()))
20983 return true;
20984 }
20985 return false;
20986 }
20989 // Return the largest legal scalable vector type that matches VT's element type.
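// For example (only the element type matters here): v8i16 and v16i16 both map
// to nxv8i16, and v4f32 maps to nxv4f32.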
20990 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
20991 assert(VT.isFixedLengthVector() &&
20992 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
20993 "Expected legal fixed length vector!");
20994 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
20995 default:
20996 llvm_unreachable("unexpected element type for SVE container");
20997 case MVT::i8:
20998 return EVT(MVT::nxv16i8);
20999 case MVT::i16:
21000 return EVT(MVT::nxv8i16);
21001 case MVT::i32:
21002 return EVT(MVT::nxv4i32);
21003 case MVT::i64:
21004 return EVT(MVT::nxv2i64);
21005 case MVT::f16:
21006 return EVT(MVT::nxv8f16);
21007 case MVT::f32:
21008 return EVT(MVT::nxv4f32);
21009 case MVT::f64:
21010 return EVT(MVT::nxv2f64);
21011 }
21012 }
21014 // Return a PTRUE with active lanes corresponding to the extent of VT.
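// For example, a v8i32 input yields an nxv4i1 PTRUE with pattern VL8; when the
// SVE register size is known to be exactly 256 bits the "all" pattern is used
// instead, enabling unpredicated instruction forms.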
21015 static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
21016 EVT VT) {
21017 assert(VT.isFixedLengthVector() &&
21018 DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21019 "Expected legal fixed length vector!");
21021 Optional<unsigned> PgPattern =
21022 getSVEPredPatternFromNumElements(VT.getVectorNumElements());
21023 assert(PgPattern && "Unexpected element count for SVE predicate");
21025 // For vectors that are exactly getMaxSVEVectorSizeInBits big, we can use
21026 // AArch64SVEPredPattern::all, which can enable the use of unpredicated
21027 // variants of instructions when available.
21028 const auto &Subtarget = DAG.getSubtarget<AArch64Subtarget>();
21029 unsigned MinSVESize = Subtarget.getMinSVEVectorSizeInBits();
21030 unsigned MaxSVESize = Subtarget.getMaxSVEVectorSizeInBits();
21031 if (MaxSVESize && MinSVESize == MaxSVESize &&
21032 MaxSVESize == VT.getSizeInBits())
21033 PgPattern = AArch64SVEPredPattern::all;
21035 MVT MaskVT;
21036 switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
21037 default:
21038 llvm_unreachable("unexpected element type for SVE predicate");
21039 case MVT::i8:
21040 MaskVT = MVT::nxv16i1;
21041 break;
21042 case MVT::i16:
21043 case MVT::f16:
21044 MaskVT = MVT::nxv8i1;
21045 break;
21046 case MVT::i32:
21047 case MVT::f32:
21048 MaskVT = MVT::nxv4i1;
21049 break;
21050 case MVT::i64:
21051 case MVT::f64:
21052 MaskVT = MVT::nxv2i1;
21053 break;
21054 }
21056 return getPTrue(DAG, DL, MaskVT, *PgPattern);
21057 }
21059 static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
21060 EVT VT) {
21061 assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
21062 "Expected legal scalable vector!");
21063 auto PredTy = VT.changeVectorElementType(MVT::i1);
21064 return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
21065 }
21067 static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
21068 if (VT.isFixedLengthVector())
21069 return getPredicateForFixedLengthVector(DAG, DL, VT);
21071 return getPredicateForScalableVector(DAG, DL, VT);
21072 }
21074 // Grow V to consume an entire SVE register.
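// For example, a v4i32 value becomes an nxv4i32 value by inserting it at
// element 0 of an undef scalable vector; lanes past the fixed width stay undef.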
21075 static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
21076 assert(VT.isScalableVector() &&
21077 "Expected to convert into a scalable vector!");
21078 assert(V.getValueType().isFixedLengthVector() &&
21079 "Expected a fixed length vector operand!");
21080 SDLoc DL(V);
21081 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21082 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
21083 }
21085 // Shrink V so it's just big enough to maintain a VT's worth of data.
21086 static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
21087 assert(VT.isFixedLengthVector() &&
21088 "Expected to convert into a fixed length vector!");
21089 assert(V.getValueType().isScalableVector() &&
21090 "Expected a scalable vector operand!");
21091 SDLoc DL(V);
21092 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21093 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
21094 }
21096 // Convert all fixed length vector loads larger than NEON to masked_loads.
21097 SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
21098 SDValue Op, SelectionDAG &DAG) const {
21099 auto Load = cast<LoadSDNode>(Op);
21101 SDLoc DL(Op);
21102 EVT VT = Op.getValueType();
21103 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21104 EVT LoadVT = ContainerVT;
21105 EVT MemVT = Load->getMemoryVT();
21107 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
21109 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
21110 LoadVT = ContainerVT.changeTypeToInteger();
21111 MemVT = MemVT.changeTypeToInteger();
21112 }
21114 SDValue NewLoad = DAG.getMaskedLoad(
21115 LoadVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(), Pg,
21116 DAG.getUNDEF(LoadVT), MemVT, Load->getMemOperand(),
21117 Load->getAddressingMode(), Load->getExtensionType());
21119 SDValue Result = NewLoad;
21120 if (VT.isFloatingPoint() && Load->getExtensionType() == ISD::EXTLOAD) {
21121 EVT ExtendVT = ContainerVT.changeVectorElementType(
21122 Load->getMemoryVT().getVectorElementType());
21124 Result = getSVESafeBitCast(ExtendVT, Result, DAG);
21125 Result = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
21126 Pg, Result, DAG.getUNDEF(ContainerVT));
21127 }
21129 Result = convertFromScalableVector(DAG, VT, Result);
21130 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
21131 return DAG.getMergeValues(MergedValues, DL);
21132 }
21134 static SDValue convertFixedMaskToScalableVector(SDValue Mask,
21135 SelectionDAG &DAG) {
21136 SDLoc DL(Mask);
21137 EVT InVT = Mask.getValueType();
21138 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
21140 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
21142 if (ISD::isBuildVectorAllOnes(Mask.getNode()))
21143 return Pg;
21145 auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask);
21146 auto Op2 = DAG.getConstant(0, DL, ContainerVT);
21148 return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(),
21149 {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)});
21150 }
21152 // Convert all fixed length vector loads larger than NEON to masked_loads.
21153 SDValue AArch64TargetLowering::LowerFixedLengthVectorMLoadToSVE(
21154 SDValue Op, SelectionDAG &DAG) const {
21155 auto Load = cast<MaskedLoadSDNode>(Op);
21157 SDLoc DL(Op);
21158 EVT VT = Op.getValueType();
21159 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21161 SDValue Mask = convertFixedMaskToScalableVector(Load->getMask(), DAG);
21163 SDValue PassThru;
21164 bool IsPassThruZeroOrUndef = false;
21166 if (Load->getPassThru()->isUndef()) {
21167 PassThru = DAG.getUNDEF(ContainerVT);
21168 IsPassThruZeroOrUndef = true;
21169 } else {
21170 if (ContainerVT.isInteger())
21171 PassThru = DAG.getConstant(0, DL, ContainerVT);
21172 else
21173 PassThru = DAG.getConstantFP(0, DL, ContainerVT);
21174 if (isZerosVector(Load->getPassThru().getNode()))
21175 IsPassThruZeroOrUndef = true;
21176 }
21178 SDValue NewLoad = DAG.getMaskedLoad(
21179 ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
21180 Mask, PassThru, Load->getMemoryVT(), Load->getMemOperand(),
21181 Load->getAddressingMode(), Load->getExtensionType());
21183 SDValue Result = NewLoad;
21184 if (!IsPassThruZeroOrUndef) {
21185 SDValue OldPassThru =
21186 convertToScalableVector(DAG, ContainerVT, Load->getPassThru());
21187 Result = DAG.getSelect(DL, ContainerVT, Mask, Result, OldPassThru);
21188 }
21190 Result = convertFromScalableVector(DAG, VT, Result);
21191 SDValue MergedValues[2] = {Result, NewLoad.getValue(1)};
21192 return DAG.getMergeValues(MergedValues, DL);
21193 }
21195 // Convert all fixed length vector stores larger than NEON to masked_stores.
21196 SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
21197 SDValue Op, SelectionDAG &DAG) const {
21198 auto Store = cast<StoreSDNode>(Op);
21200 SDLoc DL(Op);
21201 EVT VT = Store->getValue().getValueType();
21202 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21203 EVT MemVT = Store->getMemoryVT();
21205 auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
21206 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
21208 if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
21209 EVT TruncVT = ContainerVT.changeVectorElementType(
21210 Store->getMemoryVT().getVectorElementType());
21211 MemVT = MemVT.changeTypeToInteger();
21212 NewValue = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, TruncVT, Pg,
21213 NewValue, DAG.getTargetConstant(0, DL, MVT::i64),
21214 DAG.getUNDEF(TruncVT));
21215 NewValue =
21216 getSVESafeBitCast(ContainerVT.changeTypeToInteger(), NewValue, DAG);
21217 }
21219 return DAG.getMaskedStore(Store->getChain(), DL, NewValue,
21220 Store->getBasePtr(), Store->getOffset(), Pg, MemVT,
21221 Store->getMemOperand(), Store->getAddressingMode(),
21222 Store->isTruncatingStore());
21223 }
21225 SDValue AArch64TargetLowering::LowerFixedLengthVectorMStoreToSVE(
21226 SDValue Op, SelectionDAG &DAG) const {
21227 auto *Store = cast<MaskedStoreSDNode>(Op);
21229 SDLoc DL(Op);
21230 EVT VT = Store->getValue().getValueType();
21231 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21233 auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
21234 SDValue Mask = convertFixedMaskToScalableVector(Store->getMask(), DAG);
21236 return DAG.getMaskedStore(
21237 Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
21238 Mask, Store->getMemoryVT(), Store->getMemOperand(),
21239 Store->getAddressingMode(), Store->isTruncatingStore());
21240 }
21242 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntDivideToSVE(
21243 SDValue Op, SelectionDAG &DAG) const {
21244 SDLoc dl(Op);
21245 EVT VT = Op.getValueType();
21246 EVT EltVT = VT.getVectorElementType();
21248 bool Signed = Op.getOpcode() == ISD::SDIV;
21249 unsigned PredOpcode = Signed ? AArch64ISD::SDIV_PRED : AArch64ISD::UDIV_PRED;
21251 bool Negated;
21252 uint64_t SplatVal;
21253 if (Signed && isPow2Splat(Op.getOperand(1), SplatVal, Negated)) {
21254 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21255 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
21256 SDValue Op2 = DAG.getTargetConstant(Log2_64(SplatVal), dl, MVT::i32);
21258 SDValue Pg = getPredicateForFixedLengthVector(DAG, dl, VT);
21259 SDValue Res = DAG.getNode(AArch64ISD::SRAD_MERGE_OP1, dl, ContainerVT, Pg, Op1, Op2);
21260 if (Negated)
21261 Res = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Res);
21263 return convertFromScalableVector(DAG, VT, Res);
21264 }
21266 // Scalable vector i32/i64 DIV is supported.
21267 if (EltVT == MVT::i32 || EltVT == MVT::i64)
21268 return LowerToPredicatedOp(Op, DAG, PredOpcode);
21270 // Scalable vector i8/i16 DIV is not supported. Promote it to i32.
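// For example (assuming 128-bit containers and that the fully widened fixed
// type is not legal), a v16i8 SDIV is unpacked with SUNPKLO/SUNPKHI into two
// i16 halves, each half is divided recursively (widening again to i32), and
// the two results are narrowed and concatenated back with UZP1.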
21271 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21272 EVT HalfVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
21273 EVT FixedWidenedVT = HalfVT.widenIntegerVectorElementType(*DAG.getContext());
21274 EVT ScalableWidenedVT = getContainerForFixedLengthVector(DAG, FixedWidenedVT);
21276 // If this is not a full vector, extend, div, and truncate it.
21277 EVT WidenedVT = VT.widenIntegerVectorElementType(*DAG.getContext());
21278 if (DAG.getTargetLoweringInfo().isTypeLegal(WidenedVT)) {
21279 unsigned ExtendOpcode = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21280 SDValue Op0 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(0));
21281 SDValue Op1 = DAG.getNode(ExtendOpcode, dl, WidenedVT, Op.getOperand(1));
21282 SDValue Div = DAG.getNode(Op.getOpcode(), dl, WidenedVT, Op0, Op1);
21283 return DAG.getNode(ISD::TRUNCATE, dl, VT, Div);
21284 }
21286 // Convert the operands to scalable vectors.
21287 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
21288 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
21290 // Extend the scalable operands.
21291 unsigned UnpkLo = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
21292 unsigned UnpkHi = Signed ? AArch64ISD::SUNPKHI : AArch64ISD::UUNPKHI;
21293 SDValue Op0Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op0);
21294 SDValue Op1Lo = DAG.getNode(UnpkLo, dl, ScalableWidenedVT, Op1);
21295 SDValue Op0Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op0);
21296 SDValue Op1Hi = DAG.getNode(UnpkHi, dl, ScalableWidenedVT, Op1);
21298 // Convert back to fixed vectors so the DIV can be further lowered.
21299 Op0Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op0Lo);
21300 Op1Lo = convertFromScalableVector(DAG, FixedWidenedVT, Op1Lo);
21301 Op0Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op0Hi);
21302 Op1Hi = convertFromScalableVector(DAG, FixedWidenedVT, Op1Hi);
21303 SDValue ResultLo = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
21304 Op0Lo, Op1Lo);
21305 SDValue ResultHi = DAG.getNode(Op.getOpcode(), dl, FixedWidenedVT,
21306 Op0Hi, Op1Hi);
21308 // Convert again to scalable vectors to truncate.
21309 ResultLo = convertToScalableVector(DAG, ScalableWidenedVT, ResultLo);
21310 ResultHi = convertToScalableVector(DAG, ScalableWidenedVT, ResultHi);
21311 SDValue ScalableResult = DAG.getNode(AArch64ISD::UZP1, dl, ContainerVT,
21312 ResultLo, ResultHi);
21314 return convertFromScalableVector(DAG, VT, ScalableResult);
21315 }
21317 SDValue AArch64TargetLowering::LowerFixedLengthVectorIntExtendToSVE(
21318 SDValue Op, SelectionDAG &DAG) const {
21319 EVT VT = Op.getValueType();
21320 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21322 SDLoc DL(Op);
21323 SDValue Val = Op.getOperand(0);
21324 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
21325 Val = convertToScalableVector(DAG, ContainerVT, Val);
21327 bool Signed = Op.getOpcode() == ISD::SIGN_EXTEND;
21328 unsigned ExtendOpc = Signed ? AArch64ISD::SUNPKLO : AArch64ISD::UUNPKLO;
21330 // Repeatedly unpack Val until the result is of the desired element type.
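// For example, zero-extending a v8i8 held in an nxv16i8 container to i32
// elements applies UUNPKLO twice: nxv16i8 -> nxv8i16 -> nxv4i32.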
21331 switch (ContainerVT.getSimpleVT().SimpleTy) {
21332 default:
21333 llvm_unreachable("unimplemented container type");
21334 case MVT::nxv16i8:
21335 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv8i16, Val);
21336 if (VT.getVectorElementType() == MVT::i16)
21337 break;
21338 LLVM_FALLTHROUGH;
21339 case MVT::nxv8i16:
21340 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv4i32, Val);
21341 if (VT.getVectorElementType() == MVT::i32)
21342 break;
21343 LLVM_FALLTHROUGH;
21344 case MVT::nxv4i32:
21345 Val = DAG.getNode(ExtendOpc, DL, MVT::nxv2i64, Val);
21346 assert(VT.getVectorElementType() == MVT::i64 && "Unexpected element type!");
21347 break;
21348 }
21350 return convertFromScalableVector(DAG, VT, Val);
21351 }
21353 SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
21354 SDValue Op, SelectionDAG &DAG) const {
21355 EVT VT = Op.getValueType();
21356 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21358 SDLoc DL(Op);
21359 SDValue Val = Op.getOperand(0);
21360 EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
21361 Val = convertToScalableVector(DAG, ContainerVT, Val);
21363 // Repeatedly truncate Val until the result is of the desired element type.
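// For example, truncating i64 elements down to i8 takes three bitcast+UZP1
// steps, nxv2i64 -> nxv4i32 -> nxv8i16 -> nxv16i8, each keeping the low half
// of every element.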
21364 switch (ContainerVT.getSimpleVT().SimpleTy) {
21365 default:
21366 llvm_unreachable("unimplemented container type");
21367 case MVT::nxv2i64:
21368 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
21369 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
21370 if (VT.getVectorElementType() == MVT::i32)
21371 break;
21372 LLVM_FALLTHROUGH;
21373 case MVT::nxv4i32:
21374 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
21375 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
21376 if (VT.getVectorElementType() == MVT::i16)
21377 break;
21378 LLVM_FALLTHROUGH;
21379 case MVT::nxv8i16:
21380 Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
21381 Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
21382 assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
21383 break;
21384 }
21386 return convertFromScalableVector(DAG, VT, Val);
21387 }
21389 SDValue AArch64TargetLowering::LowerFixedLengthExtractVectorElt(
21390 SDValue Op, SelectionDAG &DAG) const {
21391 EVT VT = Op.getValueType();
21392 EVT InVT = Op.getOperand(0).getValueType();
21393 assert(InVT.isFixedLengthVector() && "Expected fixed length vector type!");
21395 SDLoc DL(Op);
21396 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
21397 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
21399 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Op.getOperand(1));
21400 }
21402 SDValue AArch64TargetLowering::LowerFixedLengthInsertVectorElt(
21403 SDValue Op, SelectionDAG &DAG) const {
21404 EVT VT = Op.getValueType();
21405 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21407 SDLoc DL(Op);
21408 EVT InVT = Op.getOperand(0).getValueType();
21409 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
21410 SDValue Op0 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(0));
21412 auto ScalableRes = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT, Op0,
21413 Op.getOperand(1), Op.getOperand(2));
21415 return convertFromScalableVector(DAG, VT, ScalableRes);
21416 }
21418 // Convert vector operation 'Op' to an equivalent predicated operation whereby
21419 // the original operation's type is used to construct a suitable predicate.
21420 // NOTE: The results for inactive lanes are undefined.
21421 SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
21422 SelectionDAG &DAG,
21423 unsigned NewOp) const {
21424 EVT VT = Op.getValueType();
21425 SDLoc DL(Op);
21426 auto Pg = getPredicateForVector(DAG, DL, VT);
21428 if (VT.isFixedLengthVector()) {
21429 assert(isTypeLegal(VT) && "Expected only legal fixed-width types");
21430 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21432 // Create list of operands by converting existing ones to scalable types.
21433 SmallVector<SDValue, 4> Operands = {Pg};
21434 for (const SDValue &V : Op->op_values()) {
21435 if (isa<CondCodeSDNode>(V)) {
21436 Operands.push_back(V);
21437 continue;
21438 }
21440 if (const VTSDNode *VTNode = dyn_cast<VTSDNode>(V)) {
21441 EVT VTArg = VTNode->getVT().getVectorElementType();
21442 EVT NewVTArg = ContainerVT.changeVectorElementType(VTArg);
21443 Operands.push_back(DAG.getValueType(NewVTArg));
21444 continue;
21445 }
21447 assert(isTypeLegal(V.getValueType()) &&
21448 "Expected only legal fixed-width types");
21449 Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
21450 }
21452 if (isMergePassthruOpcode(NewOp))
21453 Operands.push_back(DAG.getUNDEF(ContainerVT));
21455 auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
21456 return convertFromScalableVector(DAG, VT, ScalableRes);
21457 }
21459 assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
21461 SmallVector<SDValue, 4> Operands = {Pg};
21462 for (const SDValue &V : Op->op_values()) {
21463 assert((!V.getValueType().isVector() ||
21464 V.getValueType().isScalableVector()) &&
21465 "Only scalable vectors are supported!");
21466 Operands.push_back(V);
21467 }
21469 if (isMergePassthruOpcode(NewOp))
21470 Operands.push_back(DAG.getUNDEF(VT));
21472 return DAG.getNode(NewOp, DL, VT, Operands, Op->getFlags());
21473 }
21475 // If a fixed length vector operation has no side effects when applied to
21476 // undefined elements, we can safely use scalable vectors to perform the same
21477 // operation without needing to worry about predication.
21478 SDValue AArch64TargetLowering::LowerToScalableOp(SDValue Op,
21479 SelectionDAG &DAG) const {
21480 EVT VT = Op.getValueType();
21481 assert(useSVEForFixedLengthVectorVT(VT) &&
21482 "Only expected to lower fixed length vector operation!");
21483 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21485 // Create list of operands by converting existing ones to scalable types.
21486 SmallVector<SDValue, 4> Ops;
21487 for (const SDValue &V : Op->op_values()) {
21488 assert(!isa<VTSDNode>(V) && "Unexpected VTSDNode node!");
21490 // Pass through non-vector operands.
21491 if (!V.getValueType().isVector()) {
21492 Ops.push_back(V);
21493 continue;
21494 }
21496 // "cast" fixed length vector to a scalable vector.
21497 assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
21498 "Only fixed length vectors are supported!");
21499 Ops.push_back(convertToScalableVector(DAG, ContainerVT, V));
21500 }
21502 auto ScalableRes = DAG.getNode(Op.getOpcode(), SDLoc(Op), ContainerVT, Ops);
21503 return convertFromScalableVector(DAG, VT, ScalableRes);
21504 }
21506 SDValue AArch64TargetLowering::LowerVECREDUCE_SEQ_FADD(SDValue ScalarOp,
21507 SelectionDAG &DAG) const {
21508 SDLoc DL(ScalarOp);
21509 SDValue AccOp = ScalarOp.getOperand(0);
21510 SDValue VecOp = ScalarOp.getOperand(1);
21511 EVT SrcVT = VecOp.getValueType();
21512 EVT ResVT = SrcVT.getVectorElementType();
21514 EVT ContainerVT = SrcVT;
21515 if (SrcVT.isFixedLengthVector()) {
21516 ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
21517 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
21518 }
21520 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
21521 SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
21523 // Convert operands to Scalable.
21524 AccOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ContainerVT,
21525 DAG.getUNDEF(ContainerVT), AccOp, Zero);
21527 // Perform reduction.
21528 SDValue Rdx = DAG.getNode(AArch64ISD::FADDA_PRED, DL, ContainerVT,
21529 Pg, AccOp, VecOp);
21531 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Rdx, Zero);
21532 }
21534 SDValue AArch64TargetLowering::LowerPredReductionToSVE(SDValue ReduceOp,
21535 SelectionDAG &DAG) const {
21536 SDLoc DL(ReduceOp);
21537 SDValue Op = ReduceOp.getOperand(0);
21538 EVT OpVT = Op.getValueType();
21539 EVT VT = ReduceOp.getValueType();
21541 if (!OpVT.isScalableVector() || OpVT.getVectorElementType() != MVT::i1)
21542 return SDValue();
21544 SDValue Pg = getPredicateForVector(DAG, DL, OpVT);
21546 switch (ReduceOp.getOpcode()) {
21547 default:
21548 return SDValue();
21549 case ISD::VECREDUCE_OR:
21550 if (isAllActivePredicate(DAG, Pg) && OpVT == MVT::nxv16i1)
21551 // The predicate can be 'Op' because
21552 // vecreduce_or(Op & <all true>) <=> vecreduce_or(Op).
21553 return getPTest(DAG, VT, Op, Op, AArch64CC::ANY_ACTIVE);
21554 else
21555 return getPTest(DAG, VT, Pg, Op, AArch64CC::ANY_ACTIVE);
21556 case ISD::VECREDUCE_AND: {
21557 Op = DAG.getNode(ISD::XOR, DL, OpVT, Op, Pg);
21558 return getPTest(DAG, VT, Pg, Op, AArch64CC::NONE_ACTIVE);
21559 }
21560 case ISD::VECREDUCE_XOR: {
21561 SDValue ID =
21562 DAG.getTargetConstant(Intrinsic::aarch64_sve_cntp, DL, MVT::i64);
21563 if (OpVT == MVT::nxv1i1) {
21564 // Emulate a CNTP on .Q using .D and a different governing predicate.
21565 Pg = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Pg);
21566 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, MVT::nxv2i1, Op);
21567 }
21568 SDValue Cntp =
21569 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i64, ID, Pg, Op);
21570 return DAG.getAnyExtOrTrunc(Cntp, DL, VT);
21571 }
21572 }
21574 return SDValue();
21575 }
21577 SDValue AArch64TargetLowering::LowerReductionToSVE(unsigned Opcode,
21578 SDValue ScalarOp,
21579 SelectionDAG &DAG) const {
21580 SDLoc DL(ScalarOp);
21581 SDValue VecOp = ScalarOp.getOperand(0);
21582 EVT SrcVT = VecOp.getValueType();
21584 if (useSVEForFixedLengthVectorVT(
21585 SrcVT,
21586 /*OverrideNEON=*/Subtarget->useSVEForFixedLengthVectors())) {
21587 EVT ContainerVT = getContainerForFixedLengthVector(DAG, SrcVT);
21588 VecOp = convertToScalableVector(DAG, ContainerVT, VecOp);
21589 }
21591 // UADDV always returns an i64 result.
21592 EVT ResVT = (Opcode == AArch64ISD::UADDV_PRED) ? MVT::i64 :
21593 SrcVT.getVectorElementType();
21594 EVT RdxVT = SrcVT;
21595 if (SrcVT.isFixedLengthVector() || Opcode == AArch64ISD::UADDV_PRED)
21596 RdxVT = getPackedSVEVectorVT(ResVT);
21598 SDValue Pg = getPredicateForVector(DAG, DL, SrcVT);
21599 SDValue Rdx = DAG.getNode(Opcode, DL, RdxVT, Pg, VecOp);
21600 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT,
21601 Rdx, DAG.getConstant(0, DL, MVT::i64));
21603 // The VEC_REDUCE nodes expect an element size result.
21604 if (ResVT != ScalarOp.getValueType())
21605 Res = DAG.getAnyExtOrTrunc(Res, DL, ScalarOp.getValueType());
21607 return Res;
21608 }
21610 SDValue
21611 AArch64TargetLowering::LowerFixedLengthVectorSelectToSVE(SDValue Op,
21612 SelectionDAG &DAG) const {
21613 EVT VT = Op.getValueType();
21614 SDLoc DL(Op);
21616 EVT InVT = Op.getOperand(1).getValueType();
21617 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
21618 SDValue Op1 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(1));
21619 SDValue Op2 = convertToScalableVector(DAG, ContainerVT, Op->getOperand(2));
21621 // Convert the mask to a predicate (NOTE: We don't need to worry about
21622 // inactive lanes since VSELECT is safe when given undefined elements).
21623 EVT MaskVT = Op.getOperand(0).getValueType();
21624 EVT MaskContainerVT = getContainerForFixedLengthVector(DAG, MaskVT);
21625 auto Mask = convertToScalableVector(DAG, MaskContainerVT, Op.getOperand(0));
21626 Mask = DAG.getNode(ISD::TRUNCATE, DL,
21627 MaskContainerVT.changeVectorElementType(MVT::i1), Mask);
21629 auto ScalableRes = DAG.getNode(ISD::VSELECT, DL, ContainerVT,
21630 Mask, Op1, Op2);
21632 return convertFromScalableVector(DAG, VT, ScalableRes);
21633 }
21635 SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
21636 SDValue Op, SelectionDAG &DAG) const {
21637 SDLoc DL(Op);
21638 EVT InVT = Op.getOperand(0).getValueType();
21639 EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
21641 assert(useSVEForFixedLengthVectorVT(InVT) &&
21642 "Only expected to lower fixed length vector operation!");
21643 assert(Op.getValueType() == InVT.changeTypeToInteger() &&
21644 "Expected integer result of the same bit length as the inputs!");
21646 auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
21647 auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
21648 auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
21650 EVT CmpVT = Pg.getValueType();
21651 auto Cmp = DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT,
21652 {Pg, Op1, Op2, Op.getOperand(2)});
21654 EVT PromoteVT = ContainerVT.changeTypeToInteger();
21655 auto Promote = DAG.getBoolExtOrTrunc(Cmp, DL, PromoteVT, InVT);
21656 return convertFromScalableVector(DAG, Op.getValueType(), Promote);
21657 }
21659 SDValue
21660 AArch64TargetLowering::LowerFixedLengthBitcastToSVE(SDValue Op,
21661 SelectionDAG &DAG) const {
21662 SDLoc DL(Op);
21663 auto SrcOp = Op.getOperand(0);
21664 EVT VT = Op.getValueType();
21665 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
21666 EVT ContainerSrcVT =
21667 getContainerForFixedLengthVector(DAG, SrcOp.getValueType());
21669 SrcOp = convertToScalableVector(DAG, ContainerSrcVT, SrcOp);
21670 Op = DAG.getNode(ISD::BITCAST, DL, ContainerDstVT, SrcOp);
21671 return convertFromScalableVector(DAG, VT, Op);
21672 }
21674 SDValue AArch64TargetLowering::LowerFixedLengthConcatVectorsToSVE(
21675 SDValue Op, SelectionDAG &DAG) const {
21676 SDLoc DL(Op);
21677 unsigned NumOperands = Op->getNumOperands();
21679 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
21680 "Unexpected number of operands in CONCAT_VECTORS");
21682 auto SrcOp1 = Op.getOperand(0);
21683 auto SrcOp2 = Op.getOperand(1);
21684 EVT VT = Op.getValueType();
21685 EVT SrcVT = SrcOp1.getValueType();
21687 if (NumOperands > 2) {
21688 SmallVector<SDValue, 4> Ops;
21689 EVT PairVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext());
21690 for (unsigned I = 0; I < NumOperands; I += 2)
21691 Ops.push_back(DAG.getNode(ISD::CONCAT_VECTORS, DL, PairVT,
21692 Op->getOperand(I), Op->getOperand(I + 1)));
21694 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
21695 }
21697 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21699 SDValue Pg = getPredicateForFixedLengthVector(DAG, DL, SrcVT);
21700 SrcOp1 = convertToScalableVector(DAG, ContainerVT, SrcOp1);
21701 SrcOp2 = convertToScalableVector(DAG, ContainerVT, SrcOp2);
21703 Op = DAG.getNode(AArch64ISD::SPLICE, DL, ContainerVT, Pg, SrcOp1, SrcOp2);
21705 return convertFromScalableVector(DAG, VT, Op);
21706 }
21708 SDValue
21709 AArch64TargetLowering::LowerFixedLengthFPExtendToSVE(SDValue Op,
21710 SelectionDAG &DAG) const {
21711 EVT VT = Op.getValueType();
21712 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21714 SDLoc DL(Op);
21715 SDValue Val = Op.getOperand(0);
21716 SDValue Pg = getPredicateForVector(DAG, DL, VT);
21717 EVT SrcVT = Val.getValueType();
21718 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21719 EVT ExtendVT = ContainerVT.changeVectorElementType(
21720 SrcVT.getVectorElementType());
21722 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
21723 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT.changeTypeToInteger(), Val);
21725 Val = convertToScalableVector(DAG, ContainerVT.changeTypeToInteger(), Val);
21726 Val = getSVESafeBitCast(ExtendVT, Val, DAG);
21727 Val = DAG.getNode(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU, DL, ContainerVT,
21728 Pg, Val, DAG.getUNDEF(ContainerVT));
21730 return convertFromScalableVector(DAG, VT, Val);
21731 }
21733 SDValue
21734 AArch64TargetLowering::LowerFixedLengthFPRoundToSVE(SDValue Op,
21735 SelectionDAG &DAG) const {
21736 EVT VT = Op.getValueType();
21737 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21739 SDLoc DL(Op);
21740 SDValue Val = Op.getOperand(0);
21741 EVT SrcVT = Val.getValueType();
21742 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
21743 EVT RoundVT = ContainerSrcVT.changeVectorElementType(
21744 VT.getVectorElementType());
21745 SDValue Pg = getPredicateForVector(DAG, DL, RoundVT);
21747 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
21748 Val = DAG.getNode(AArch64ISD::FP_ROUND_MERGE_PASSTHRU, DL, RoundVT, Pg, Val,
21749 Op.getOperand(1), DAG.getUNDEF(RoundVT));
21750 Val = getSVESafeBitCast(ContainerSrcVT.changeTypeToInteger(), Val, DAG);
21751 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
21753 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
21754 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
21755 }
21757 SDValue
21758 AArch64TargetLowering::LowerFixedLengthIntToFPToSVE(SDValue Op,
21759 SelectionDAG &DAG) const {
21760 EVT VT = Op.getValueType();
21761 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21763 bool IsSigned = Op.getOpcode() == ISD::SINT_TO_FP;
21764 unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
21765 : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
21767 SDLoc DL(Op);
21768 SDValue Val = Op.getOperand(0);
21769 EVT SrcVT = Val.getValueType();
21770 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
21771 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
21773 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
21774 ContainerDstVT.getVectorElementType().getSizeInBits()) {
21775 SDValue Pg = getPredicateForVector(DAG, DL, VT);
21777 Val = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
21778 VT.changeTypeToInteger(), Val);
21780 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
21781 Val = getSVESafeBitCast(ContainerDstVT.changeTypeToInteger(), Val, DAG);
21782 // Safe to use a larger than specified operand since we just unpacked the
21783 // data, hence the upper bits are zero.
21784 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
21785 DAG.getUNDEF(ContainerDstVT));
21786 return convertFromScalableVector(DAG, VT, Val);
21787 } else {
21788 EVT CvtVT = ContainerSrcVT.changeVectorElementType(
21789 ContainerDstVT.getVectorElementType());
21790 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
21792 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
21793 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
21794 Val = getSVESafeBitCast(ContainerSrcVT, Val, DAG);
21795 Val = convertFromScalableVector(DAG, SrcVT, Val);
21797 Val = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Val);
21798 return DAG.getNode(ISD::BITCAST, DL, VT, Val);
21799 }
21800 }
21802 SDValue
21803 AArch64TargetLowering::LowerFixedLengthFPToIntToSVE(SDValue Op,
21804 SelectionDAG &DAG) const {
21805 EVT VT = Op.getValueType();
21806 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21808 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
21809 unsigned Opcode = IsSigned ? AArch64ISD::FCVTZS_MERGE_PASSTHRU
21810 : AArch64ISD::FCVTZU_MERGE_PASSTHRU;
21812 SDLoc DL(Op);
21813 SDValue Val = Op.getOperand(0);
21814 EVT SrcVT = Val.getValueType();
21815 EVT ContainerDstVT = getContainerForFixedLengthVector(DAG, VT);
21816 EVT ContainerSrcVT = getContainerForFixedLengthVector(DAG, SrcVT);
21818 if (ContainerSrcVT.getVectorElementType().getSizeInBits() <=
21819 ContainerDstVT.getVectorElementType().getSizeInBits()) {
21820 EVT CvtVT = ContainerDstVT.changeVectorElementType(
21821 ContainerSrcVT.getVectorElementType());
21822 SDValue Pg = getPredicateForVector(DAG, DL, VT);
21824 Val = DAG.getNode(ISD::BITCAST, DL, SrcVT.changeTypeToInteger(), Val);
21825 Val = DAG.getNode(ISD::ANY_EXTEND, DL, VT, Val);
21827 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
21828 Val = getSVESafeBitCast(CvtVT, Val, DAG);
21829 Val = DAG.getNode(Opcode, DL, ContainerDstVT, Pg, Val,
21830 DAG.getUNDEF(ContainerDstVT));
21831 return convertFromScalableVector(DAG, VT, Val);
21832 } else {
21833 EVT CvtVT = ContainerSrcVT.changeTypeToInteger();
21834 SDValue Pg = getPredicateForVector(DAG, DL, CvtVT);
21836 // Safe to use a larger than specified result since an fp_to_int where the
21837 // result doesn't fit into the destination is undefined.
21838 Val = convertToScalableVector(DAG, ContainerSrcVT, Val);
21839 Val = DAG.getNode(Opcode, DL, CvtVT, Pg, Val, DAG.getUNDEF(CvtVT));
21840 Val = convertFromScalableVector(DAG, SrcVT.changeTypeToInteger(), Val);
21842 return DAG.getNode(ISD::TRUNCATE, DL, VT, Val);
21843 }
21844 }
21846 SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
21847 SDValue Op, SelectionDAG &DAG) const {
21848 EVT VT = Op.getValueType();
21849 assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
21851 auto *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
21852 auto ShuffleMask = SVN->getMask();
21854 SDLoc DL(Op);
21855 SDValue Op1 = Op.getOperand(0);
21856 SDValue Op2 = Op.getOperand(1);
21858 EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
21859 Op1 = convertToScalableVector(DAG, ContainerVT, Op1);
21860 Op2 = convertToScalableVector(DAG, ContainerVT, Op2);
21862 bool ReverseEXT = false;
21863 unsigned Imm;
21864 if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm) &&
21865 Imm == VT.getVectorNumElements() - 1) {
21866 if (ReverseEXT)
21867 std::swap(Op1, Op2);
21869 EVT ScalarTy = VT.getVectorElementType();
21870 if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
21871 ScalarTy = MVT::i32;
21872 SDValue Scalar = DAG.getNode(
21873 ISD::EXTRACT_VECTOR_ELT, DL, ScalarTy, Op1,
21874 DAG.getConstant(VT.getVectorNumElements() - 1, DL, MVT::i64));
21875 Op = DAG.getNode(AArch64ISD::INSR, DL, ContainerVT, Op2, Scalar);
21876 return convertFromScalableVector(DAG, VT, Op);
21877 }
21879 for (unsigned LaneSize : {64U, 32U, 16U}) {
21880 if (isREVMask(ShuffleMask, VT, LaneSize)) {
21881 EVT NewVT =
21882 getPackedSVEVectorVT(EVT::getIntegerVT(*DAG.getContext(), LaneSize));
21883 unsigned RevOp;
21884 unsigned EltSz = VT.getScalarSizeInBits();
21885 if (EltSz == 8)
21886 RevOp = AArch64ISD::BSWAP_MERGE_PASSTHRU;
21887 else if (EltSz == 16)
21888 RevOp = AArch64ISD::REVH_MERGE_PASSTHRU;
21889 else
21890 RevOp = AArch64ISD::REVW_MERGE_PASSTHRU;
21892 Op = DAG.getNode(ISD::BITCAST, DL, NewVT, Op1);
21893 Op = LowerToPredicatedOp(Op, DAG, RevOp);
21894 Op = DAG.getNode(ISD::BITCAST, DL, ContainerVT, Op);
21895 return convertFromScalableVector(DAG, VT, Op);
21896 }
21897 }
21899 unsigned WhichResult;
21900 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
21901 return convertFromScalableVector(
21902 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
21904 if (isTRNMask(ShuffleMask, VT, WhichResult)) {
21905 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
21906 return convertFromScalableVector(
21907 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
21908 }
21910 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
21911 return convertFromScalableVector(
21912 DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op1));
21914 if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
21915 unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
21916 return convertFromScalableVector(
21917 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
21918 }
21920 // Functions like isZIPMask return true when an ISD::VECTOR_SHUFFLE's mask
21921 // represents the same logical operation as performed by a ZIP instruction. In
21922 // isolation these functions do not mean the ISD::VECTOR_SHUFFLE is exactly
21923 // equivalent to an AArch64 instruction. There's the extra component of
21924 // ISD::VECTOR_SHUFFLE's value type to consider. Prior to SVE these functions
21925 // only operated on 64/128bit vector types that have a direct mapping to a
21926 // target register and so an exact mapping is implied.
21927 // However, when using SVE for fixed length vectors, most legal vector types
21928 // are actually sub-vectors of a larger SVE register. When mapping
21929 // ISD::VECTOR_SHUFFLE to an SVE instruction care must be taken to consider
21930 // how the mask's indices translate. Specifically, when the mapping requires
21931 // an exact meaning for a specific vector index (e.g. Index X is the last
21932 // vector element in the register) then such mappings are often only safe when
21933 // the exact SVE register size is know. The main exception to this is when
21934 // indices are logically relative to the first element of either
21935 // ISD::VECTOR_SHUFFLE operand because these relative indices don't change
21936 // when converting from fixed-length to scalable vector types (i.e. the start
21937 // of a fixed length vector is always the start of a scalable vector).
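// For example, ZIP1 only reads the low halves of its source registers, so the
// mapping holds for any register size, whereas ZIP2 reads the high halves of
// the full SVE registers and is therefore only used below when the register
// size is known to equal the fixed-length vector size.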
21938 unsigned MinSVESize = Subtarget->getMinSVEVectorSizeInBits();
21939 unsigned MaxSVESize = Subtarget->getMaxSVEVectorSizeInBits();
21940 if (MinSVESize == MaxSVESize && MaxSVESize == VT.getSizeInBits()) {
21941 if (ShuffleVectorInst::isReverseMask(ShuffleMask) && Op2.isUndef()) {
21942 Op = DAG.getNode(ISD::VECTOR_REVERSE, DL, ContainerVT, Op1);
21943 return convertFromScalableVector(DAG, VT, Op);
21944 }
21946 if (isZIPMask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
21947 return convertFromScalableVector(
21948 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
21950 if (isUZPMask(ShuffleMask, VT, WhichResult)) {
21951 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
21952 return convertFromScalableVector(
21953 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
21954 }
21956 if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult != 0)
21957 return convertFromScalableVector(
21958 DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op1));
21960 if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
21961 unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
21962 return convertFromScalableVector(
21963 DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op1));
21964 }
21965 }
21967 return SDValue();
21968 }
21970 SDValue AArch64TargetLowering::getSVESafeBitCast(EVT VT, SDValue Op,
21971 SelectionDAG &DAG) const {
21972 SDLoc DL(Op);
21973 EVT InVT = Op.getValueType();
21975 assert(VT.isScalableVector() && isTypeLegal(VT) &&
21976 InVT.isScalableVector() && isTypeLegal(InVT) &&
21977 "Only expect to cast between legal scalable vector types!");
21978 assert(VT.getVectorElementType() != MVT::i1 &&
21979 InVT.getVectorElementType() != MVT::i1 &&
21980 "For predicate bitcasts, use getSVEPredicateBitCast");
21985 EVT PackedVT = getPackedSVEVectorVT(VT.getVectorElementType());
21986 EVT PackedInVT = getPackedSVEVectorVT(InVT.getVectorElementType());
21988 // Safe bitcasting between unpacked vector types of different element counts
21989 // is currently unsupported because the following is missing the necessary
21990 // work to ensure the result's elements live where they're supposed to within
21991 // an SVE register.
21993 // e.g. nxv2i32 = XX??XX??
21994 // nxv4f16 = X?X?X?X?
21995 assert((VT.getVectorElementCount() == InVT.getVectorElementCount() ||
21996 VT == PackedVT || InVT == PackedInVT) &&
21997 "Unexpected bitcast!");
21999 // Pack input if required.
22000 if (InVT != PackedInVT)
22001 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, PackedInVT, Op);
22003 Op = DAG.getNode(ISD::BITCAST, DL, PackedVT, Op);
22005 // Unpack result if required.
22006 if (VT != PackedVT)
22007 Op = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, VT, Op);
22009 return Op;
22010 }
22012 bool AArch64TargetLowering::isAllActivePredicate(SelectionDAG &DAG,
22013 SDValue N) const {
22014 return ::isAllActivePredicate(DAG, N);
22015 }
22017 EVT AArch64TargetLowering::getPromotedVTForPredicate(EVT VT) const {
22018 return ::getPromotedVTForPredicate(VT);
22019 }
22021 bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode(
22022 SDValue Op, const APInt &OriginalDemandedBits,
22023 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
22024 unsigned Depth) const {
22026 unsigned Opc = Op.getOpcode();
22027 switch (Opc) {
22028 case AArch64ISD::VSHL: {
22029 // Match (VSHL (VLSHR Val X) X)
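// Illustrative case: with i16 lanes, (VSHL (VLSHR x, 8), 8) clears the low
// eight bits of each lane; if the caller never demands those bits, the pair
// can be simplified to just x.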
22030 SDValue ShiftL = Op;
22031 SDValue ShiftR = Op->getOperand(0);
22032 if (ShiftR->getOpcode() != AArch64ISD::VLSHR)
22033 return false;
22035 if (!ShiftL.hasOneUse() || !ShiftR.hasOneUse())
22036 return false;
22038 unsigned ShiftLBits = ShiftL->getConstantOperandVal(1);
22039 unsigned ShiftRBits = ShiftR->getConstantOperandVal(1);
22041 // Other cases can be handled as well, but this is not
22042 // implemented.
22043 if (ShiftRBits != ShiftLBits)
22044 return false;
22046 unsigned ScalarSize = Op.getScalarValueSizeInBits();
22047 assert(ScalarSize > ShiftLBits && "Invalid shift imm");
22049 APInt ZeroBits = APInt::getLowBitsSet(ScalarSize, ShiftLBits);
22050 APInt UnusedBits = ~OriginalDemandedBits;
22052 if ((ZeroBits & UnusedBits) != ZeroBits)
22053 return false;
22055 // All bits that are zeroed by (VSHL (VLSHR Val X) X) are not
22056 // used - simplify to just Val.
22057 return TLO.CombineTo(Op, ShiftR->getOperand(0));
22058 }
22059 }
22061 return TargetLowering::SimplifyDemandedBitsForTargetNode(
22062 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
22063 }
22065 bool AArch64TargetLowering::isTargetCanonicalConstantNode(SDValue Op) const {
22066 return Op.getOpcode() == AArch64ISD::DUP ||
22067 (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
22068 Op.getOperand(0).getOpcode() == AArch64ISD::DUP) ||
22069 TargetLowering::isTargetCanonicalConstantNode(Op);
22070 }
22072 bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
22073 unsigned Opc, LLT Ty1, LLT Ty2) const {
22074 return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
22075 }