//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
// both before and after the DAG is legalized.
//
// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
// primarily intended to handle simplification opportunities that are implicit
// in the LLVM IR and exposed by the various codegen lowering phases.
//
//===----------------------------------------------------------------------===//

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/IntervalMap.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/RuntimeLibcalls.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "dagcombine"

STATISTIC(NodesCombined   , "Number of dag nodes combined");
STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
STATISTIC(OpsNarrowed     , "Number of load/op/store narrowed");
STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
STATISTIC(SlicedLoads, "Number of loads sliced");
STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");

static cl::opt<bool>
CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                 cl::desc("Enable DAG combiner's use of IR alias analysis"));

static cl::opt<bool>
UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
        cl::desc("Enable DAG combiner's use of TBAA"));

#ifndef NDEBUG
static cl::opt<std::string>
CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
                   cl::desc("Only use DAG-combiner alias analysis in this"
                            " function"));
#endif

/// Hidden option to stress test load slicing, i.e., when this option
/// is enabled, load slicing bypasses most of its profitability guards.
static cl::opt<bool>
StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
                  cl::desc("Bypass the profitability model of load slicing"),
                  cl::init(false));

static cl::opt<bool>
  MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
                    cl::desc("DAG combiner may split indexing from loads"));

static cl::opt<bool>
    EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
                       cl::desc("DAG combiner enable merging multiple stores "
                                "into a wider store"));

static cl::opt<unsigned> TokenFactorInlineLimit(
    "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
    cl::desc("Limit the number of operands to inline for Token Factors"));

static cl::opt<unsigned> StoreMergeDependenceLimit(
    "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
    cl::desc("Limit the number of times for the same StoreNode and RootNode "
             "to bail out in store merging dependence check"));

static cl::opt<bool> EnableReduceLoadOpStoreWidth(
    "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
    cl::desc("DAG combiner enable reducing the width of load/op/store "
             "sequence"));

static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
    "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
    cl::desc("DAG combiner enable load/<replace bytes>/store with "
             "a narrower store"));

namespace {

  class DAGCombiner {
    SelectionDAG &DAG;
    const TargetLowering &TLI;
    const SelectionDAGTargetInfo *STI;
    CombineLevel Level = BeforeLegalizeTypes;
    CodeGenOpt::Level OptLevel;
    bool LegalDAG = false;
    bool LegalOperations = false;
    bool LegalTypes = false;
    bool ForCodeSize;
    bool DisableGenericCombines;

    /// Worklist of all of the nodes that need to be simplified.
    ///
    /// This must behave as a stack -- new nodes to process are pushed onto the
    /// back and when processing we pop off of the back.
    ///
    /// The worklist will not contain duplicates but may contain null entries
    /// due to nodes being deleted from the underlying DAG.
    SmallVector<SDNode *, 64> Worklist;

    /// Mapping from an SDNode to its position on the worklist.
    ///
    /// This is used to find and remove nodes from the worklist (by nulling
    /// them) when they are deleted from the underlying DAG. It relies on
    /// stable indices of nodes within the worklist.
    DenseMap<SDNode *, unsigned> WorklistMap;

    /// This records all nodes attempted to be added to the worklist since we
    /// considered a new worklist entry. Since we do not add duplicate nodes
    /// to the worklist, this is different from the tail of the worklist.
    SmallSetVector<SDNode *, 32> PruningList;

    /// Set of nodes which have been combined (at least once).
    ///
    /// This is used to allow us to reliably add any operands of a DAG node
    /// which have not yet been combined to the worklist.
    SmallPtrSet<SDNode *, 32> CombinedNodes;

    /// Map from candidate StoreNode to the pair of RootNode and count.
    /// The count is used to track how many times we have seen the StoreNode
    /// with the same RootNode bail out in dependence check. If we have seen
    /// the bail out for the same pair many times over a limit, we won't
    /// consider the StoreNode with the same RootNode as store merging
    /// candidate again.
    DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;

    // AA - Used for DAG load/store alias analysis.
    AliasAnalysis *AA;

    /// When an instruction is simplified, add all users of the instruction to
    /// the work lists because they might get more simplified now.
    void AddUsersToWorklist(SDNode *N) {
      for (SDNode *Node : N->uses())
        AddToWorklist(Node);
    }

    /// Convenient shorthand to add a node and all of its users to the
    /// worklist.
    void AddToWorklistWithUsers(SDNode *N) {
      AddUsersToWorklist(N);
      AddToWorklist(N);
    }

    // Prune potentially dangling nodes. This is called after
    // any visit to a node, but should also be called during a visit after any
    // failed combine which may have created a DAG node.
    void clearAddedDanglingWorklistEntries() {
      // Check any nodes added to the worklist to see if they are prunable.
      while (!PruningList.empty()) {
        auto *N = PruningList.pop_back_val();
        if (N->use_empty())
          recursivelyDeleteUnusedNodes(N);
      }
    }

    SDNode *getNextWorklistEntry() {
      // Before we do any work, remove nodes that are not in use.
      clearAddedDanglingWorklistEntries();
      SDNode *N = nullptr;
      // The Worklist holds the SDNodes in order, but it may contain null
      // entries.
      while (!N && !Worklist.empty()) {
        N = Worklist.pop_back_val();
      }

      if (N) {
        bool GoodWorklistEntry = WorklistMap.erase(N);
        (void)GoodWorklistEntry;
        assert(GoodWorklistEntry &&
               "Found a worklist entry without a corresponding map entry!");
      }
      return N;
    }

    /// Call the node-specific routine that folds each particular type of node.
    SDValue visit(SDNode *N);

  public:
    DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
        : DAG(D), TLI(D.getTargetLoweringInfo()),
          STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL), AA(AA) {
      ForCodeSize = DAG.shouldOptForSize();
      DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);

      MaximumLegalStoreInBits = 0;
      // We use the minimum store size here, since that's all we can guarantee
      // for the scalable vector types.
      for (MVT VT : MVT::all_valuetypes())
        if (EVT(VT).isSimple() && VT != MVT::Other &&
            TLI.isTypeLegal(EVT(VT)) &&
            VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits)
          MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize();
    }
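
    // For example (illustrative): on a target whose widest legal store type
    // is v4i32, the loop above leaves MaximumLegalStoreInBits == 128; a
    // scalable type such as nxv4i32 only contributes its known minimum size.
    // The exact value depends on the target's set of legal types.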

    void ConsiderForPruning(SDNode *N) {
      // Mark this for potential pruning.
      PruningList.insert(N);
    }

    /// Add to the worklist making sure its instance is at the back (next to be
    /// processed).
    void AddToWorklist(SDNode *N) {
      assert(N->getOpcode() != ISD::DELETED_NODE &&
             "Deleted Node added to Worklist");

      // Skip handle nodes as they can't usefully be combined and confuse the
      // zero-use deletion strategy.
      if (N->getOpcode() == ISD::HANDLENODE)
        return;

      ConsiderForPruning(N);

      if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
        Worklist.push_back(N);
    }

    /// Remove all instances of N from the worklist.
    void removeFromWorklist(SDNode *N) {
      CombinedNodes.erase(N);
      PruningList.remove(N);
      StoreRootCountMap.erase(N);

      auto It = WorklistMap.find(N);
      if (It == WorklistMap.end())
        return; // Not in the worklist.

      // Null out the entry rather than erasing it to avoid a linear operation.
      Worklist[It->second] = nullptr;
      WorklistMap.erase(It);
    }

    void deleteAndRecombine(SDNode *N);
    bool recursivelyDeleteUnusedNodes(SDNode *N);

    /// Replaces all uses of the results of one DAG node with new values.
    SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                      bool AddTo = true);

    /// Replaces all uses of the results of one DAG node with new values.
    SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
      return CombineTo(N, &Res, 1, AddTo);
    }

    /// Replaces all uses of the results of one DAG node with new values.
    SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
                      bool AddTo = true) {
      SDValue To[] = { Res0, Res1 };
      return CombineTo(N, To, 2, AddTo);
    }
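
    // Typical use (illustrative): a visit routine that rewrites a load can
    // replace both of its results at once, e.g.
    //   CombineTo(LoadNode, NewValue, NewChain);
    // so that result 0 (the value) and result 1 (the chain) are replaced
    // together and the worklist bookkeeping is handled here.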

    void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);

  private:
    unsigned MaximumLegalStoreInBits;

    /// Check the specified integer node value to see if it can be simplified
    /// or if things it uses can be simplified by bit propagation.
    /// If so, return true.
    bool SimplifyDemandedBits(SDValue Op) {
      unsigned BitWidth = Op.getScalarValueSizeInBits();
      APInt DemandedBits = APInt::getAllOnes(BitWidth);
      return SimplifyDemandedBits(Op, DemandedBits);
    }

    bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
      TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
      KnownBits Known;
      if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
        return false;

      // Revisit the node.
      AddToWorklist(Op.getNode());

      CommitTargetLoweringOpt(TLO);
      return true;
    }
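
    // For example (illustrative): if Op is (and X, 0xFF) and bit propagation
    // proves the high bits of X are already zero, the target hook records a
    // replacement of Op by X in TLO; committing it rewrites the DAG and
    // queues the affected nodes for another visit.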

    /// Check the specified vector node value to see if it can be simplified or
    /// if things it uses can be simplified as it only uses some of the
    /// elements. If so, return true.
    bool SimplifyDemandedVectorElts(SDValue Op) {
      // TODO: For now just pretend it cannot be simplified.
      if (Op.getValueType().isScalableVector())
        return false;

      unsigned NumElts = Op.getValueType().getVectorNumElements();
      APInt DemandedElts = APInt::getAllOnes(NumElts);
      return SimplifyDemandedVectorElts(Op, DemandedElts);
    }
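
    // For example (illustrative): if a v4i32 Op only feeds an
    // extract_vector_elt of lane 0, the overload below can be called with
    // DemandedElts == 0b0001, letting the target simplify the unused lanes
    // to undef.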

    bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                              const APInt &DemandedElts,
                              bool AssumeSingleUse = false);
    bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
                                    bool AssumeSingleUse = false);

    bool CombineToPreIndexedLoadStore(SDNode *N);
    bool CombineToPostIndexedLoadStore(SDNode *N);
    SDValue SplitIndexingFromLoad(LoadSDNode *LD);
    bool SliceUpLoad(SDNode *N);

    // Scalars have size 0 to distinguish from singleton vectors.
    SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
    bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
    bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);

    /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
    /// load.
    ///
    /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
    /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
    /// \param EltNo index of the vector element to load.
    /// \param OriginalLoad load that EVE came from to be replaced.
    /// \returns EVE on success, SDValue() on failure.
    SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
                                         SDValue EltNo,
                                         LoadSDNode *OriginalLoad);
    void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
    SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
    SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
    SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
    SDValue PromoteIntBinOp(SDValue Op);
    SDValue PromoteIntShiftOp(SDValue Op);
    SDValue PromoteExtend(SDValue Op);
    bool PromoteLoad(SDValue Op);

    /// Call the node-specific routine that knows how to fold each
    /// particular type of node. If that doesn't do anything, try the
    /// target-specific DAG combines.
    SDValue combine(SDNode *N);

    // Visitation implementation - Implement dag node combining for different
    // node types. The semantics are as follows:
    // Return Value:
    //   SDValue.getNode() == 0 - No change was made
    //   SDValue.getNode() == N - N was replaced, is dead and has been handled.
    //   otherwise              - N should be replaced by the returned Operand.
    //
    SDValue visitTokenFactor(SDNode *N);
    SDValue visitMERGE_VALUES(SDNode *N);
    SDValue visitADD(SDNode *N);
    SDValue visitADDLike(SDNode *N);
    SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
    SDValue visitSUB(SDNode *N);
    SDValue visitADDSAT(SDNode *N);
    SDValue visitSUBSAT(SDNode *N);
    SDValue visitADDC(SDNode *N);
    SDValue visitADDO(SDNode *N);
    SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
    SDValue visitSUBC(SDNode *N);
    SDValue visitSUBO(SDNode *N);
    SDValue visitADDE(SDNode *N);
    SDValue visitADDCARRY(SDNode *N);
    SDValue visitSADDO_CARRY(SDNode *N);
    SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
    SDValue visitSUBE(SDNode *N);
    SDValue visitSUBCARRY(SDNode *N);
    SDValue visitSSUBO_CARRY(SDNode *N);
    SDValue visitMUL(SDNode *N);
    SDValue visitMULFIX(SDNode *N);
    SDValue useDivRem(SDNode *N);
    SDValue visitSDIV(SDNode *N);
    SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
    SDValue visitUDIV(SDNode *N);
    SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
    SDValue visitREM(SDNode *N);
    SDValue visitMULHU(SDNode *N);
    SDValue visitMULHS(SDNode *N);
    SDValue visitAVG(SDNode *N);
    SDValue visitSMUL_LOHI(SDNode *N);
    SDValue visitUMUL_LOHI(SDNode *N);
    SDValue visitMULO(SDNode *N);
    SDValue visitIMINMAX(SDNode *N);
    SDValue visitAND(SDNode *N);
    SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
    SDValue visitOR(SDNode *N);
    SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
    SDValue visitXOR(SDNode *N);
    SDValue SimplifyVBinOp(SDNode *N, const SDLoc &DL);
    SDValue visitSHL(SDNode *N);
    SDValue visitSRA(SDNode *N);
    SDValue visitSRL(SDNode *N);
    SDValue visitFunnelShift(SDNode *N);
    SDValue visitSHLSAT(SDNode *N);
    SDValue visitRotate(SDNode *N);
    SDValue visitABS(SDNode *N);
    SDValue visitBSWAP(SDNode *N);
    SDValue visitBITREVERSE(SDNode *N);
    SDValue visitCTLZ(SDNode *N);
    SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
    SDValue visitCTTZ(SDNode *N);
    SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
    SDValue visitCTPOP(SDNode *N);
    SDValue visitSELECT(SDNode *N);
    SDValue visitVSELECT(SDNode *N);
    SDValue visitSELECT_CC(SDNode *N);
    SDValue visitSETCC(SDNode *N);
    SDValue visitSETCCCARRY(SDNode *N);
    SDValue visitSIGN_EXTEND(SDNode *N);
    SDValue visitZERO_EXTEND(SDNode *N);
    SDValue visitANY_EXTEND(SDNode *N);
    SDValue visitAssertExt(SDNode *N);
    SDValue visitAssertAlign(SDNode *N);
    SDValue visitSIGN_EXTEND_INREG(SDNode *N);
    SDValue visitEXTEND_VECTOR_INREG(SDNode *N);
    SDValue visitTRUNCATE(SDNode *N);
    SDValue visitBITCAST(SDNode *N);
    SDValue visitFREEZE(SDNode *N);
    SDValue visitBUILD_PAIR(SDNode *N);
    SDValue visitFADD(SDNode *N);
    SDValue visitSTRICT_FADD(SDNode *N);
    SDValue visitFSUB(SDNode *N);
    SDValue visitFMUL(SDNode *N);
    SDValue visitFMA(SDNode *N);
    SDValue visitFDIV(SDNode *N);
    SDValue visitFREM(SDNode *N);
    SDValue visitFSQRT(SDNode *N);
    SDValue visitFCOPYSIGN(SDNode *N);
    SDValue visitFPOW(SDNode *N);
    SDValue visitSINT_TO_FP(SDNode *N);
    SDValue visitUINT_TO_FP(SDNode *N);
    SDValue visitFP_TO_SINT(SDNode *N);
    SDValue visitFP_TO_UINT(SDNode *N);
    SDValue visitFP_ROUND(SDNode *N);
    SDValue visitFP_EXTEND(SDNode *N);
    SDValue visitFNEG(SDNode *N);
    SDValue visitFABS(SDNode *N);
    SDValue visitFCEIL(SDNode *N);
    SDValue visitFTRUNC(SDNode *N);
    SDValue visitFFLOOR(SDNode *N);
    SDValue visitFMinMax(SDNode *N);
    SDValue visitBRCOND(SDNode *N);
    SDValue visitBR_CC(SDNode *N);
    SDValue visitLOAD(SDNode *N);

    SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
    SDValue replaceStoreOfFPConstant(StoreSDNode *ST);

    SDValue visitSTORE(SDNode *N);
    SDValue visitLIFETIME_END(SDNode *N);
    SDValue visitINSERT_VECTOR_ELT(SDNode *N);
    SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
    SDValue visitBUILD_VECTOR(SDNode *N);
    SDValue visitCONCAT_VECTORS(SDNode *N);
    SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
    SDValue visitVECTOR_SHUFFLE(SDNode *N);
    SDValue visitSCALAR_TO_VECTOR(SDNode *N);
    SDValue visitINSERT_SUBVECTOR(SDNode *N);
    SDValue visitMLOAD(SDNode *N);
    SDValue visitMSTORE(SDNode *N);
    SDValue visitMGATHER(SDNode *N);
    SDValue visitMSCATTER(SDNode *N);
    SDValue visitFP_TO_FP16(SDNode *N);
    SDValue visitFP16_TO_FP(SDNode *N);
    SDValue visitFP_TO_BF16(SDNode *N);
    SDValue visitVECREDUCE(SDNode *N);
    SDValue visitVPOp(SDNode *N);

    SDValue visitFADDForFMACombine(SDNode *N);
    SDValue visitFSUBForFMACombine(SDNode *N);
    SDValue visitFMULForFMADistributiveCombine(SDNode *N);

    SDValue XformToShuffleWithZero(SDNode *N);
    bool reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                    const SDLoc &DL,
                                                    SDNode *N,
                                                    SDValue N0,
                                                    SDValue N1);
    SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
                                      SDValue N1);
    SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                           SDValue N1, SDNodeFlags Flags);

    SDValue visitShiftByConstant(SDNode *N);

    SDValue foldSelectOfConstants(SDNode *N);
    SDValue foldVSelectOfConstants(SDNode *N);
    SDValue foldBinOpIntoSelect(SDNode *BO);
    bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
    SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
    SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
    SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                             SDValue N2, SDValue N3, ISD::CondCode CC,
                             bool NotExtCompare = false);
    SDValue convertSelectOfFPConstantsToLoadOffset(
        const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
        ISD::CondCode CC);
    SDValue foldSignChangeInBitcast(SDNode *N);
    SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                   SDValue N2, SDValue N3, ISD::CondCode CC);
    SDValue foldSelectOfBinops(SDNode *N);
    SDValue foldSextSetcc(SDNode *N);
    SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
                              const SDLoc &DL);
    SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
    SDValue unfoldMaskedMerge(SDNode *N);
    SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
    SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                          const SDLoc &DL, bool foldBooleans);
    SDValue rebuildSetCC(SDValue N);

    bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
                           SDValue &CC, bool MatchStrict = false) const;
    bool isOneUseSetCC(SDValue N) const;

    SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                       unsigned HiOp);
    SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
    SDValue CombineExtLoad(SDNode *N);
    SDValue CombineZExtLogicopShiftLoad(SDNode *N);
    SDValue combineRepeatedFPDivisors(SDNode *N);
    SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
    SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
    SDValue BuildSDIV(SDNode *N);
    SDValue BuildSDIVPow2(SDNode *N);
    SDValue BuildUDIV(SDNode *N);
    SDValue BuildSREMPow2(SDNode *N);
    SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N);
    SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
    SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
    SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
    SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
    SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
    SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
                                SDNodeFlags Flags, bool Reciprocal);
    SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
                                SDNodeFlags Flags, bool Reciprocal);
    SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
                               bool DemandHighBits = true);
    SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
    SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
                              SDValue InnerPos, SDValue InnerNeg, bool HasPos,
                              unsigned PosOpcode, unsigned NegOpcode,
                              const SDLoc &DL);
    SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
                              SDValue InnerPos, SDValue InnerNeg, bool HasPos,
                              unsigned PosOpcode, unsigned NegOpcode,
                              const SDLoc &DL);
    SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
    SDValue MatchLoadCombine(SDNode *N);
    SDValue mergeTruncStores(StoreSDNode *N);
    SDValue reduceLoadWidth(SDNode *N);
    SDValue ReduceLoadOpStoreWidth(SDNode *N);
    SDValue splitMergedValStore(StoreSDNode *ST);
    SDValue TransformFPLoadStorePair(SDNode *N);
    SDValue convertBuildVecZextToZext(SDNode *N);
    SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
    SDValue reduceBuildVecTruncToBitCast(SDNode *N);
    SDValue reduceBuildVecToShuffle(SDNode *N);
    SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                  ArrayRef<int> VectorMask, SDValue VecIn1,
                                  SDValue VecIn2, unsigned LeftIdx,
                                  bool DidSplitVec);
    SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);

    /// Walk up chain skipping non-aliasing memory nodes,
    /// looking for aliasing nodes and adding them to the Aliases vector.
    void GatherAllAliases(SDNode *N, SDValue OriginalChain,
                          SmallVectorImpl<SDValue> &Aliases);

    /// Return true if there is any possibility that the two addresses overlap.
    bool mayAlias(SDNode *Op0, SDNode *Op1) const;

    /// Walk up chain skipping non-aliasing memory nodes, looking for a better
    /// chain (aliasing node).
    SDValue FindBetterChain(SDNode *N, SDValue Chain);

    /// Try to replace a store and any possibly adjacent stores on
    /// consecutive chains with better chains. Return true only if St is
    /// replaced.
    ///
    /// Notice that other chains may still be replaced even if the function
    /// returns false.
    bool findBetterNeighborChains(StoreSDNode *St);

    // Helper for findBetterNeighborChains. Walk up the store chain, adding
    // chained stores that do not overlap and can be parallelized.
    bool parallelizeChainedStores(StoreSDNode *St);

    /// Holds a pointer to an LSBaseSDNode as well as information on where it
    /// is located in a sequence of memory operations connected by a chain.
    struct MemOpLink {
      // Ptr to the mem node.
      LSBaseSDNode *MemNode;

      // Offset from the base ptr.
      int64_t OffsetFromBase;

      MemOpLink(LSBaseSDNode *N, int64_t Offset)
          : MemNode(N), OffsetFromBase(Offset) {}
    };

    // Classify the origin of a stored value.
    enum class StoreSource { Unknown, Constant, Extract, Load };
    StoreSource getStoreSource(SDValue StoreVal) {
      switch (StoreVal.getOpcode()) {
      case ISD::Constant:
      case ISD::ConstantFP:
        return StoreSource::Constant;
      case ISD::EXTRACT_VECTOR_ELT:
      case ISD::EXTRACT_SUBVECTOR:
        return StoreSource::Extract;
      case ISD::LOAD:
        return StoreSource::Load;
      default:
        return StoreSource::Unknown;
      }
    }
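
    // For example: a store of (f32 ConstantFP<1.0>) classifies as
    // StoreSource::Constant and a store of a freshly loaded value classifies
    // as StoreSource::Load; anything else is Unknown and is not considered
    // for store merging.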

    /// This is a helper function for visitMUL to check the profitability
    /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
    /// MulNode is the original multiply, AddNode is (add x, c1),
    /// and ConstNode is c2.
    bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
                                     SDValue ConstNode);

    /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
    /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
    /// the type of the loaded value to be extended.
    bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                          EVT LoadResultTy, EVT &ExtVT);

    /// Helper function to calculate whether the given Load/Store can have its
    /// width reduced to ExtVT.
    bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
                           EVT &MemVT, unsigned ShAmt = 0);

    /// Used by BackwardsPropagateMask to find suitable loads.
    bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
                           SmallPtrSetImpl<SDNode*> &NodesWithConsts,
                           ConstantSDNode *Mask, SDNode *&NodeToMask);
    /// Attempt to propagate a given AND node back to load leaves so that they
    /// can be combined into narrow loads.
    bool BackwardsPropagateMask(SDNode *N);

    /// Helper function for mergeConsecutiveStores which merges the component
    /// store chains.
    SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                unsigned NumStores);

    /// This is a helper function for mergeConsecutiveStores. When the source
    /// elements of the consecutive stores are all constants or all extracted
    /// vector elements, try to merge them into one larger store introducing
    /// bitcasts if necessary. \return True if a merged store was created.
    bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
                                         EVT MemVT, unsigned NumStores,
                                         bool IsConstantSrc, bool UseVector,
                                         bool UseTrunc);

    /// This is a helper function for mergeConsecutiveStores. Stores that
    /// potentially may be merged with St are placed in StoreNodes. RootNode is
    /// a chain predecessor to all store candidates.
    void getStoreMergeCandidates(StoreSDNode *St,
                                 SmallVectorImpl<MemOpLink> &StoreNodes,
                                 SDNode *&Root);

    /// Helper function for mergeConsecutiveStores. Checks if candidate stores
    /// have indirect dependency through their operands. RootNode is the
    /// predecessor to all stores calculated by getStoreMergeCandidates and is
    /// used to prune the dependency check. \return True if safe to merge.
    bool checkMergeStoreCandidatesForDependencies(
        SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
        SDNode *RootNode);

    /// This is a helper function for mergeConsecutiveStores. Given a list of
    /// store candidates, find the first N that are consecutive in memory.
    /// Returns 0 if there are not at least 2 consecutive stores to try merging.
    unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
                                  int64_t ElementSizeBytes) const;

    /// This is a helper function for mergeConsecutiveStores. It is used for
    /// store chains that are composed entirely of constant values.
    bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
                                  unsigned NumConsecutiveStores,
                                  EVT MemVT, SDNode *Root, bool AllowVectors);

    /// This is a helper function for mergeConsecutiveStores. It is used for
    /// store chains that are composed entirely of extracted vector elements.
    /// When extracting multiple vector elements, try to store them in one
    /// vector store rather than a sequence of scalar stores.
    bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
                                 unsigned NumConsecutiveStores, EVT MemVT,
                                 SDNode *Root);

    /// This is a helper function for mergeConsecutiveStores. It is used for
    /// store chains that are composed entirely of loaded values.
    bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
                              unsigned NumConsecutiveStores, EVT MemVT,
                              SDNode *Root, bool AllowVectors,
                              bool IsNonTemporalStore, bool IsNonTemporalLoad);

    /// Merge consecutive store operations into a wide store.
    /// This optimization uses wide integers or vectors when possible.
    /// \return true if stores were merged.
    bool mergeConsecutiveStores(StoreSDNode *St);

    /// Try to transform a truncation where C is a constant:
    ///   (trunc (and X, C)) -> (and (trunc X), (trunc C))
    ///
    /// \p N needs to be a truncation and its first operand an AND. Other
    /// requirements are checked by the function (e.g. that trunc is
    /// single-use) and if missed an empty SDValue is returned.
    SDValue distributeTruncateThroughAnd(SDNode *N);

    /// Helper function to determine whether the target supports operation
    /// given by \p Opcode for type \p VT, that is, whether the operation
    /// is legal or custom before legalizing operations, and whether it is
    /// legal (but not custom) after legalization.
    bool hasOperation(unsigned Opcode, EVT VT) {
      return TLI.isOperationLegalOrCustom(Opcode, VT, LegalOperations);
    }
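
    // For example (illustrative): before operation legalization,
    // hasOperation(ISD::FSHL, MVT::i32) is true if the target marks the
    // funnel shift Legal or Custom; once LegalOperations is set, only Legal
    // qualifies.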

  public:
    /// Runs the dag combiner on all nodes in the work list.
    void Run(CombineLevel AtLevel);

    SelectionDAG &getDAG() const { return DAG; }

    /// Returns a type large enough to hold any valid shift amount - before type
    /// legalization these can be huge.
    EVT getShiftAmountTy(EVT LHSTy) {
      assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
      return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
    }

    /// This method returns true if we are running before type legalization or
    /// if the specified VT is legal.
    bool isTypeLegal(const EVT &VT) {
      if (!LegalTypes) return true;
      return TLI.isTypeLegal(VT);
    }

    /// Convenience wrapper around TargetLowering::getSetCCResultType
    EVT getSetCCResultType(EVT VT) const {
      return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
    }

    void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
                         SDValue OrigLoad, SDValue ExtLoad,
                         ISD::NodeType ExtType);
  };

/// This class is a DAGUpdateListener that removes any deleted
/// nodes from the worklist.
class WorklistRemover : public SelectionDAG::DAGUpdateListener {
  DAGCombiner &DC;

public:
  explicit WorklistRemover(DAGCombiner &dc)
      : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

  void NodeDeleted(SDNode *N, SDNode *E) override {
    DC.removeFromWorklist(N);
  }
};

class WorklistInserter : public SelectionDAG::DAGUpdateListener {
  DAGCombiner &DC;

public:
  explicit WorklistInserter(DAGCombiner &dc)
      : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}

  // FIXME: Ideally we could add N to the worklist, but this causes exponential
  // compile time costs in large DAGs, e.g. Halide.
  void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};

} // end anonymous namespace

//===----------------------------------------------------------------------===//
// TargetLowering::DAGCombinerInfo implementation
//===----------------------------------------------------------------------===//

void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
  ((DAGCombiner*)DC)->AddToWorklist(N);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
}

SDValue TargetLowering::DAGCombinerInfo::
CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
  return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
}

bool TargetLowering::DAGCombinerInfo::
recursivelyDeleteUnusedNodes(SDNode *N) {
  return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
}

void TargetLowering::DAGCombinerInfo::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
  return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

void DAGCombiner::deleteAndRecombine(SDNode *N) {
  removeFromWorklist(N);

  // If the operands of this node are only used by the node, they will now be
  // dead. Make sure to re-visit them and recursively delete dead nodes.
  for (const SDValue &Op : N->ops())
    // For an operand generating multiple values, one of the values may
    // become dead allowing further simplification (e.g. split index
    // arithmetic from an indexed load).
    if (Op->hasOneUse() || Op->getNumValues() > 1)
      AddToWorklist(Op.getNode());

  DAG.DeleteNode(N);
}

// APInts must be the same size for most operations, this helper
// function zero extends the shorter of the pair so that they match.
// We provide an Offset so that we can create bitwidths that won't overflow.
static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
  unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth());
  LHS = LHS.zext(Bits);
  RHS = RHS.zext(Bits);
}
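
// Worked example (illustrative): if LHS is an 8-bit 0x80 and RHS is a 16-bit
// 0xFFFF, calling zeroExtendToMatch(LHS, RHS, /*Offset=*/1) widens both to
// max(8, 16) + 1 == 17 bits, so a following add or shl on the pair cannot
// wrap the common width.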

// Return true if this node is a setcc, or is a select_cc
// that selects between the target values used for true and false, making it
// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
// the appropriate nodes based on the type of node we are checking. This
// simplifies life a bit for the callers.
bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
                                    SDValue &CC, bool MatchStrict) const {
  if (N.getOpcode() == ISD::SETCC) {
    LHS = N.getOperand(0);
    RHS = N.getOperand(1);
    CC = N.getOperand(2);
    return true;
  }

  if (MatchStrict &&
      (N.getOpcode() == ISD::STRICT_FSETCC ||
       N.getOpcode() == ISD::STRICT_FSETCCS)) {
    LHS = N.getOperand(1);
    RHS = N.getOperand(2);
    CC = N.getOperand(3);
    return true;
  }

  if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2)) ||
      !TLI.isConstFalseVal(N.getOperand(3)))
    return false;

  if (TLI.getBooleanContents(N.getValueType()) ==
      TargetLowering::UndefinedBooleanContent)
    return false;

  LHS = N.getOperand(0);
  RHS = N.getOperand(1);
  CC = N.getOperand(4);
  return true;
}
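
// For example: (select_cc lhs, rhs, -1, 0, cc) on a target whose boolean
// contents are "all ones true, zero false" behaves exactly like
// (setcc lhs, rhs, cc), so callers can handle both forms uniformly through
// the LHS/RHS/CC out-parameters.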

/// Return true if this is a SetCC-equivalent operation with only one use.
/// If this is true, it allows the users to invert the operation for free when
/// it is profitable to do so.
bool DAGCombiner::isOneUseSetCC(SDValue N) const {
  SDValue N0, N1, N2;
  if (isSetCCEquivalent(N, N0, N1, N2) && N->hasOneUse())
    return true;
  return false;
}

static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
  if (!ScalarTy.isSimple())
    return false;

  uint64_t MaskForTy = 0ULL;
  switch (ScalarTy.getSimpleVT().SimpleTy) {
  case MVT::i8:
    MaskForTy = 0xFFULL;
    break;
  case MVT::i16:
    MaskForTy = 0xFFFFULL;
    break;
  case MVT::i32:
    MaskForTy = 0xFFFFFFFFULL;
    break;
  default:
    return false;
  }

  APInt Val;
  if (ISD::isConstantSplatVector(N, Val))
    return Val.getLimitedValue() == MaskForTy;

  return false;
}
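
// For example: a build_vector splat of the constant 0xFFFF matches
// ScalarTy == MVT::i16, since the splatted value is exactly the low-16-bit
// mask. (Illustrative; only i8, i16, and i32 masks are recognized above.)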

// Determines if it is a constant integer or a splat/build vector of constant
// integers (and undefs).
// Do not permit build vector implicit truncation.
static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
  if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
    return !(Const->isOpaque() && NoOpaques);
  if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR)
    return false;
  unsigned BitWidth = N.getScalarValueSizeInBits();
  for (const SDValue &Op : N->op_values()) {
    if (Op.isUndef())
      continue;
    ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
    if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
        (Const->isOpaque() && NoOpaques))
      return false;
  }
  return true;
}

// Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
// undefs.
static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
  if (V.getOpcode() != ISD::BUILD_VECTOR)
    return false;
  return isConstantOrConstantVector(V, NoOpaques) ||
         ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
}

// Determine if an indexed load's index can be split off, i.e. the index is
// not an opaque target constant.
static bool canSplitIdx(LoadSDNode *LD) {
  return MaySplitLoadIndex &&
         (LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
          !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
}

bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
                                                             const SDLoc &DL,
                                                             SDNode *N,
                                                             SDValue N0,
                                                             SDValue N1) {
  // Currently this only tries to ensure we don't undo the GEP splits done by
  // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
  // we check if the following transformation would be problematic:
  // (load/store (add, (add, x, offset1), offset2)) ->
  // (load/store (add, x, offset1+offset2)).
  //
  // (load/store (add, (add, x, y), offset2)) ->
  // (load/store (add, (add, x, offset2), y)).

  if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
    return false;

  auto *C2 = dyn_cast<ConstantSDNode>(N1);
  if (!C2)
    return false;

  const APInt &C2APIntVal = C2->getAPIntValue();
  if (C2APIntVal.getSignificantBits() > 64)
    return false;

  if (auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
    if (N0.hasOneUse())
      return false;

    const APInt &C1APIntVal = C1->getAPIntValue();
    const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
    if (CombinedValueIntVal.getSignificantBits() > 64)
      return false;
    const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();

    for (SDNode *Node : N->uses()) {
      if (auto *LoadStore = dyn_cast<MemSDNode>(Node)) {
        // Is x[offset2] already not a legal addressing mode? If so then
        // reassociating the constants breaks nothing (we test offset2 because
        // that's the one we hope to fold into the load or store).
        TargetLoweringBase::AddrMode AM;
        AM.HasBaseReg = true;
        AM.BaseOffs = C2APIntVal.getSExtValue();
        EVT VT = LoadStore->getMemoryVT();
        unsigned AS = LoadStore->getAddressSpace();
        Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
        if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
          continue;

        // Would x[offset1+offset2] still be a legal addressing mode?
        AM.BaseOffs = CombinedValue;
        if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
          return true;
      }
    }
  } else {
    if (auto *GA = dyn_cast<GlobalAddressSDNode>(N0.getOperand(1)))
      if (GA->getOpcode() == ISD::GlobalAddress && TLI.isOffsetFoldingLegal(GA))
        return false;

    for (SDNode *Node : N->uses()) {
      auto *LoadStore = dyn_cast<MemSDNode>(Node);
      if (!LoadStore)
        return false;

      // Is x[offset2] a legal addressing mode? If so then
      // reassociating the constants breaks the address pattern.
      TargetLoweringBase::AddrMode AM;
      AM.HasBaseReg = true;
      AM.BaseOffs = C2APIntVal.getSExtValue();
      EVT VT = LoadStore->getMemoryVT();
      unsigned AS = LoadStore->getAddressSpace();
      Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
      if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
        return false;
    }
    return true;
  }

  return false;
}
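
// For example (illustrative): given (load (add (add x, 4096), 4)) on a
// target where an immediate offset of 4 is legal but 4100 is not, folding
// the constants to (load (add x, 4100)) would push the offset out of the
// addressing mode, so this returns true and the reassociation is skipped.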

// Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
// such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
                                               SDValue N0, SDValue N1) {
  EVT VT = N0.getValueType();

  if (N0.getOpcode() != Opc)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);

  if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
    if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
      // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
      if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
        return DAG.getNode(Opc, DL, VT, N00, OpNode);
      return SDValue();
    }
    if (TLI.isReassocProfitable(DAG, N0, N1)) {
      // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
      //              iff (op x, c1) has one use
      SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1);
      return DAG.getNode(Opc, DL, VT, OpNode, N01);
    }
  }

  // Check for repeated operand logic simplifications.
  if (Opc == ISD::AND || Opc == ISD::OR) {
    // (N00 & N01) & N00 --> N00 & N01
    // (N00 & N01) & N01 --> N00 & N01
    // (N00 | N01) | N00 --> N00 | N01
    // (N00 | N01) | N01 --> N00 | N01
    if (N1 == N00 || N1 == N01)
      return N0;
  }
  if (Opc == ISD::XOR) {
    // (N00 ^ N01) ^ N00 --> N01
    if (N1 == N00)
      return N01;
    // (N00 ^ N01) ^ N01 --> N00
    if (N1 == N01)
      return N00;
  }

  if (TLI.isReassocProfitable(DAG, N0, N1)) {
    if (N1 != N01) {
      // Reassociate if (op N00, N1) already exists.
      if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N00, N1})) {
        // If (op (op N00, N1), N01) already exists as well, stop
        // reassociating to avoid an infinite loop.
        if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N01}))
          return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N01);
      }
    }

    if (N1 != N00) {
      // Reassociate if (op N01, N1) already exists.
      if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N01, N1})) {
        // If (op (op N01, N1), N00) already exists as well, stop
        // reassociating to avoid an infinite loop.
        if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N00}))
          return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N00);
      }
    }
  }

  return SDValue();
}

// Try to reassociate commutative binops.
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
                                    SDValue N1, SDNodeFlags Flags) {
  assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");

  // Floating-point reassociation is not allowed without loose FP math.
  if (N0.getValueType().isFloatingPoint() ||
      N1.getValueType().isFloatingPoint())
    if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
      return SDValue();

  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
    return Combined;
  if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
    return Combined;
  return SDValue();
}
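
// For example: (add (add x, C1), C2) becomes (add x, C1+C2), and
// (xor (xor x, y), y) collapses to x via the repeated-operand checks in
// reassociateOpsCommutative above.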

SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
                               bool AddTo) {
  assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
             To[0].dump(&DAG);
             dbgs() << " and " << NumTo - 1 << " other values\n");
  for (unsigned i = 0, e = NumTo; i != e; ++i)
    assert((!To[i].getNode() ||
            N->getValueType(i) == To[i].getValueType()) &&
           "Cannot combine value to value of different type!");

  WorklistRemover DeadNodes(*this);
  DAG.ReplaceAllUsesWith(N, To);
  if (AddTo) {
    // Push the new nodes and any users onto the worklist.
    for (unsigned i = 0, e = NumTo; i != e; ++i) {
      if (To[i].getNode())
        AddToWorklistWithUsers(To[i].getNode());
    }
  }

  // Finally, if the node is now dead, remove it from the graph. The node
  // may not be dead if the replacement process recursively simplified to
  // something else needing this node.
  if (N->use_empty())
    deleteAndRecombine(N);
  return SDValue(N, 0);
}

void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
  // Replace the old value with the new one.
  ++NodesCombined;
  LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.dump(&DAG);
             dbgs() << "\nWith: "; TLO.New.dump(&DAG); dbgs() << '\n');

  // Replace all uses.
  DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);

  // Push the new node and any (possibly new) users onto the worklist.
  AddToWorklistWithUsers(TLO.New.getNode());

  // Finally, if the node is now dead, remove it from the graph.
  recursivelyDeleteUnusedNodes(TLO.Old.getNode());
}

/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
                                       const APInt &DemandedElts,
                                       bool AssumeSingleUse) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  KnownBits Known;
  if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
                                AssumeSingleUse))
    return false;

  // Revisit the node.
  AddToWorklist(Op.getNode());

  CommitTargetLoweringOpt(TLO);
  return true;
}

/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the elements.
/// If so, return true.
bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
                                             const APInt &DemandedElts,
                                             bool AssumeSingleUse) {
  TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
  APInt KnownUndef, KnownZero;
  if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
                                      TLO, 0, AssumeSingleUse))
    return false;

  // Revisit the node.
  AddToWorklist(Op.getNode());

  CommitTargetLoweringOpt(TLO);
  return true;
}

void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
  SDLoc DL(Load);
  EVT VT = Load->getValueType(0);
  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));

  LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
             Trunc.dump(&DAG); dbgs() << '\n');

  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));

  AddToWorklist(Trunc.getNode());
  recursivelyDeleteUnusedNodes(Load);
}

SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
  Replace = false;
  SDLoc DL(Op);
  if (ISD::isUNINDEXEDLoad(Op.getNode())) {
    LoadSDNode *LD = cast<LoadSDNode>(Op);
    EVT MemVT = LD->getMemoryVT();
    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
                                                      : LD->getExtensionType();
    Replace = true;
    return DAG.getExtLoad(ExtType, DL, PVT,
                          LD->getChain(), LD->getBasePtr(),
                          MemVT, LD->getMemOperand());
  }

  unsigned Opc = Op.getOpcode();
  switch (Opc) {
  default: break;
  case ISD::AssertSext:
    if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
      return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1));
    break;
  case ISD::AssertZext:
    if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
      return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1));
    break;
  case ISD::Constant: {
    unsigned ExtOpc =
        Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
    return DAG.getNode(ExtOpc, DL, PVT, Op);
  }
  }

  if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
    return SDValue();
  return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
}

SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
  if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
    return SDValue();
  EVT OldVT = Op.getValueType();
  SDLoc DL(Op);
  bool Replace = false;
  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
  if (!NewOp.getNode())
    return SDValue();
  AddToWorklist(NewOp.getNode());

  if (Replace)
    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
  return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
                     DAG.getValueType(OldVT));
}

SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
  EVT OldVT = Op.getValueType();
  SDLoc DL(Op);
  bool Replace = false;
  SDValue NewOp = PromoteOperand(Op, PVT, Replace);
  if (!NewOp.getNode())
    return SDValue();
  AddToWorklist(NewOp.getNode());

  if (Replace)
    ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
  return DAG.getZeroExtendInReg(NewOp, DL, OldVT);
}

/// Promote the specified integer binary operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));

    bool Replace0 = false;
    SDValue N0 = Op.getOperand(0);
    SDValue NN0 = PromoteOperand(N0, PVT, Replace0);

    bool Replace1 = false;
    SDValue N1 = Op.getOperand(1);
    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);

    SDLoc DL(Op);
    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));

    // We are always replacing N0/N1's use in N and only need additional
    // replacements if there are additional uses.
    // Note: We are checking uses of the *nodes* (SDNode) rather than values
    //       (SDValue) here because the node may reference multiple values
    //       (for example, the chain value of a load node).
    Replace0 &= !N0->hasOneUse();
    Replace1 &= (N0 != N1) && !N1->hasOneUse();

    // Combine Op here so it is preserved past replacements.
    CombineTo(Op.getNode(), RV);

    // If operands have a use ordering, make sure we deal with
    // predecessor first.
    if (Replace0 && Replace1 && N0->isPredecessorOf(N1.getNode())) {
      std::swap(N0, N1);
      std::swap(NN0, NN1);
    }

    if (Replace0) {
      AddToWorklist(NN0.getNode());
      ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
    }
    if (Replace1) {
      AddToWorklist(NN1.getNode());
      ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
    }
    return Op;
  }
  return SDValue();
}
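
// For example (illustrative): on x86, (i16 (add a, b)) may be rewritten as
// (trunc (i32 (add (anyext a), (anyext b)))), since 16-bit instructions need
// an extra operand-size prefix and the truncate/extends are usually free.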

/// Promote the specified integer shift operation if the target indicates it is
/// beneficial. e.g. On x86, it's usually better to promote i16 operations to
/// i32 since i16 instructions are longer.
SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));

    bool Replace = false;
    SDValue N0 = Op.getOperand(0);
    if (Opc == ISD::SRA)
      N0 = SExtPromoteOperand(N0, PVT);
    else if (Opc == ISD::SRL)
      N0 = ZExtPromoteOperand(N0, PVT);
    else
      N0 = PromoteOperand(N0, PVT, Replace);

    if (!N0.getNode())
      return SDValue();

    SDLoc DL(Op);
    SDValue N1 = Op.getOperand(1);
    SDValue RV =
        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));

    if (Replace)
      ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());

    // Deal with Op being deleted.
    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
      return RV;
  }
  return SDValue();
}

SDValue DAGCombiner::PromoteExtend(SDValue Op) {
  if (!LegalOperations)
    return SDValue();

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return SDValue();

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");
    // fold (aext (aext x)) -> (aext x)
    // fold (aext (zext x)) -> (zext x)
    // fold (aext (sext x)) -> (sext x)
    LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
    return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
  }
  return SDValue();
}

bool DAGCombiner::PromoteLoad(SDValue Op) {
  if (!LegalOperations)
    return false;

  if (!ISD::isUNINDEXEDLoad(Op.getNode()))
    return false;

  EVT VT = Op.getValueType();
  if (VT.isVector() || !VT.isInteger())
    return false;

  // If operation type is 'undesirable', e.g. i16 on x86, consider
  // promoting it.
  unsigned Opc = Op.getOpcode();
  if (TLI.isTypeDesirableForOp(Opc, VT))
    return false;

  EVT PVT = VT;
  // Consult target whether it is a good idea to promote this operation and
  // what's the right type to promote it to.
  if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
    assert(PVT != VT && "Don't know what type to promote to!");

    SDLoc DL(Op);
    SDNode *N = Op.getNode();
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT MemVT = LD->getMemoryVT();
    ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
                                                      : LD->getExtensionType();
    SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
                                   LD->getChain(), LD->getBasePtr(),
                                   MemVT, LD->getMemOperand());
    SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);

    LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
               Result.dump(&DAG); dbgs() << '\n');

    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));

    AddToWorklist(Result.getNode());
    recursivelyDeleteUnusedNodes(N);
    return true;
  }

  return false;
}

/// Recursively delete a node which has no uses and any operands for
/// which it is the only use.
///
/// Note that this both deletes the nodes and removes them from the worklist.
/// It also adds any nodes that have had a user deleted to the worklist as they
/// may now have only one use and be subject to other combines.
bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
  if (!N->use_empty())
    return false;

  SmallSetVector<SDNode *, 16> Nodes;
  Nodes.insert(N);
  do {
    N = Nodes.pop_back_val();
    if (!N)
      continue;

    if (N->use_empty()) {
      for (const SDValue &ChildN : N->op_values())
        Nodes.insert(ChildN.getNode());

      removeFromWorklist(N);
      DAG.DeleteNode(N);
    } else {
      AddToWorklist(N);
    }
  } while (!Nodes.empty());
  return true;
}
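
// For example (illustrative): deleting a dead (add (mul x, y), z) removes the
// add first; if the mul's only user was that add, the mul is collected and
// deleted on a later iteration of the loop above, while surviving operands
// are put back on the worklist for further combining.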

//===----------------------------------------------------------------------===//
// Main DAG Combiner implementation
//===----------------------------------------------------------------------===//
1567 void DAGCombiner::Run(CombineLevel AtLevel) {
1568 // set the instance variables, so that the various visit routines may use it.
1570 LegalDAG = Level >= AfterLegalizeDAG;
1571 LegalOperations = Level >= AfterLegalizeVectorOps;
1572 LegalTypes = Level >= AfterLegalizeTypes;
1574 WorklistInserter AddNodes(*this);
1576 // Add all the dag nodes to the worklist.
1577 for (SDNode &Node : DAG.allnodes())
1578 AddToWorklist(&Node);
1580 // Create a dummy node (which is not added to allnodes), that adds a reference
1581 // to the root node, preventing it from being deleted, and tracking any
1582 // changes of the root.
1583 HandleSDNode Dummy(DAG.getRoot());
1585 // While we have a valid worklist entry node, try to combine it.
1586 while (SDNode *N = getNextWorklistEntry()) {
1587 // If N has no uses, it is dead. Make sure to revisit all N's operands once
1588 // N is deleted from the DAG, since they too may now be dead or may have a
1589 // reduced number of uses, allowing other xforms.
1590 if (recursivelyDeleteUnusedNodes(N))
1593 WorklistRemover DeadNodes(*this);
1595 // If this combine is running after legalizing the DAG, re-legalize any
1596 // nodes pulled off the worklist.
1598 SmallSetVector<SDNode *, 16> UpdatedNodes;
1599 bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
1601 for (SDNode *LN : UpdatedNodes)
1602 AddToWorklistWithUsers(LN);
1608 LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
1610 // Add any operands of the new node which have not yet been combined to the
1611 // worklist as well. Because the worklist uniques things already, this
1612 // won't repeatedly process the same operand.
1613 CombinedNodes.insert(N);
1614 for (const SDValue &ChildN : N->op_values())
1615 if (!CombinedNodes.count(ChildN.getNode()))
1616 AddToWorklist(ChildN.getNode());
1618 SDValue RV = combine(N);
1625 // If we get back the same node we passed in, rather than a new node or
1626 // zero, we know that the node must have defined multiple values and
1627 // CombineTo was used. Since CombineTo takes care of the worklist
1628 // mechanics for us, we have no work to do in this case.
1629 if (RV.getNode() == N)
1632 assert(N->getOpcode() != ISD::DELETED_NODE &&
1633 RV.getOpcode() != ISD::DELETED_NODE &&
1634 "Node was deleted but visit returned new node!");
1636 LLVM_DEBUG(dbgs() << " ... into: "; RV.dump(&DAG));
1638 if (N->getNumValues() == RV->getNumValues())
1639 DAG.ReplaceAllUsesWith(N, RV.getNode());
1641 assert(N->getValueType(0) == RV.getValueType() &&
1642 N->getNumValues() == 1 && "Type mismatch");
1643 DAG.ReplaceAllUsesWith(N, &RV);
1646 // Push the new node and any users onto the worklist. Omit this if the
1647 // new node is the EntryToken (e.g. if a store managed to get optimized
1648 // out), because re-visiting the EntryToken and its users will not uncover
1649 // any additional opportunities, but there may be a large number of such
1650 // users, potentially causing compile time explosion.
1651 if (RV.getOpcode() != ISD::EntryToken) {
1652 AddToWorklist(RV.getNode());
1653 AddUsersToWorklist(RV.getNode());
1656 // Finally, if the node is now dead, remove it from the graph. The node
1657 // may not be dead if the replacement process recursively simplified to
1658 // something else needing this node. This will also take care of adding any
1659 // operands which have lost a user to the worklist.
1660 recursivelyDeleteUnusedNodes(N);
1663   // If the root changed (e.g. it was a dead load), update the root.
1664 DAG.setRoot(Dummy.getValue());
1665 DAG.RemoveDeadNodes();
1668 SDValue DAGCombiner::visit(SDNode *N) {
1669 switch (N->getOpcode()) {
1671 case ISD::TokenFactor: return visitTokenFactor(N);
1672 case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
1673 case ISD::ADD: return visitADD(N);
1674 case ISD::SUB: return visitSUB(N);
1676 case ISD::UADDSAT: return visitADDSAT(N);
1678 case ISD::USUBSAT: return visitSUBSAT(N);
1679 case ISD::ADDC: return visitADDC(N);
1681 case ISD::UADDO: return visitADDO(N);
1682 case ISD::SUBC: return visitSUBC(N);
1684 case ISD::USUBO: return visitSUBO(N);
1685 case ISD::ADDE: return visitADDE(N);
1686 case ISD::ADDCARRY: return visitADDCARRY(N);
1687 case ISD::SADDO_CARRY: return visitSADDO_CARRY(N);
1688 case ISD::SUBE: return visitSUBE(N);
1689 case ISD::SUBCARRY: return visitSUBCARRY(N);
1690 case ISD::SSUBO_CARRY: return visitSSUBO_CARRY(N);
1692 case ISD::SMULFIXSAT:
1694 case ISD::UMULFIXSAT: return visitMULFIX(N);
1695 case ISD::MUL: return visitMUL(N);
1696 case ISD::SDIV: return visitSDIV(N);
1697 case ISD::UDIV: return visitUDIV(N);
1699 case ISD::UREM: return visitREM(N);
1700 case ISD::MULHU: return visitMULHU(N);
1701 case ISD::MULHS: return visitMULHS(N);
1702 case ISD::AVGFLOORS:
1703 case ISD::AVGFLOORU:
1705 case ISD::AVGCEILU: return visitAVG(N);
1706 case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
1707 case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
1709 case ISD::UMULO: return visitMULO(N);
1713 case ISD::UMAX: return visitIMINMAX(N);
1714 case ISD::AND: return visitAND(N);
1715 case ISD::OR: return visitOR(N);
1716 case ISD::XOR: return visitXOR(N);
1717 case ISD::SHL: return visitSHL(N);
1718 case ISD::SRA: return visitSRA(N);
1719 case ISD::SRL: return visitSRL(N);
1721 case ISD::ROTL: return visitRotate(N);
1723 case ISD::FSHR: return visitFunnelShift(N);
1725 case ISD::USHLSAT: return visitSHLSAT(N);
1726 case ISD::ABS: return visitABS(N);
1727 case ISD::BSWAP: return visitBSWAP(N);
1728 case ISD::BITREVERSE: return visitBITREVERSE(N);
1729 case ISD::CTLZ: return visitCTLZ(N);
1730 case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
1731 case ISD::CTTZ: return visitCTTZ(N);
1732 case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
1733 case ISD::CTPOP: return visitCTPOP(N);
1734 case ISD::SELECT: return visitSELECT(N);
1735 case ISD::VSELECT: return visitVSELECT(N);
1736 case ISD::SELECT_CC: return visitSELECT_CC(N);
1737 case ISD::SETCC: return visitSETCC(N);
1738 case ISD::SETCCCARRY: return visitSETCCCARRY(N);
1739 case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
1740 case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
1741 case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
1742 case ISD::AssertSext:
1743 case ISD::AssertZext: return visitAssertExt(N);
1744 case ISD::AssertAlign: return visitAssertAlign(N);
1745 case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
1746 case ISD::SIGN_EXTEND_VECTOR_INREG:
1747 case ISD::ZERO_EXTEND_VECTOR_INREG: return visitEXTEND_VECTOR_INREG(N);
1748 case ISD::TRUNCATE: return visitTRUNCATE(N);
1749 case ISD::BITCAST: return visitBITCAST(N);
1750 case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
1751 case ISD::FADD: return visitFADD(N);
1752 case ISD::STRICT_FADD: return visitSTRICT_FADD(N);
1753 case ISD::FSUB: return visitFSUB(N);
1754 case ISD::FMUL: return visitFMUL(N);
1755 case ISD::FMA: return visitFMA(N);
1756 case ISD::FDIV: return visitFDIV(N);
1757 case ISD::FREM: return visitFREM(N);
1758 case ISD::FSQRT: return visitFSQRT(N);
1759 case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
1760 case ISD::FPOW: return visitFPOW(N);
1761 case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
1762 case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
1763 case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
1764 case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
1765 case ISD::FP_ROUND: return visitFP_ROUND(N);
1766 case ISD::FP_EXTEND: return visitFP_EXTEND(N);
1767 case ISD::FNEG: return visitFNEG(N);
1768 case ISD::FABS: return visitFABS(N);
1769 case ISD::FFLOOR: return visitFFLOOR(N);
1773 case ISD::FMAXIMUM: return visitFMinMax(N);
1774 case ISD::FCEIL: return visitFCEIL(N);
1775 case ISD::FTRUNC: return visitFTRUNC(N);
1776 case ISD::BRCOND: return visitBRCOND(N);
1777 case ISD::BR_CC: return visitBR_CC(N);
1778 case ISD::LOAD: return visitLOAD(N);
1779 case ISD::STORE: return visitSTORE(N);
1780 case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
1781 case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
1782 case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
1783 case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
1784 case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
1785 case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
1786 case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
1787 case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
1788 case ISD::MGATHER: return visitMGATHER(N);
1789 case ISD::MLOAD: return visitMLOAD(N);
1790 case ISD::MSCATTER: return visitMSCATTER(N);
1791 case ISD::MSTORE: return visitMSTORE(N);
1792 case ISD::LIFETIME_END: return visitLIFETIME_END(N);
1793 case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
1794 case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
1795 case ISD::FP_TO_BF16: return visitFP_TO_BF16(N);
1796 case ISD::FREEZE: return visitFREEZE(N);
1797 case ISD::VECREDUCE_FADD:
1798 case ISD::VECREDUCE_FMUL:
1799 case ISD::VECREDUCE_ADD:
1800 case ISD::VECREDUCE_MUL:
1801 case ISD::VECREDUCE_AND:
1802 case ISD::VECREDUCE_OR:
1803 case ISD::VECREDUCE_XOR:
1804 case ISD::VECREDUCE_SMAX:
1805 case ISD::VECREDUCE_SMIN:
1806 case ISD::VECREDUCE_UMAX:
1807 case ISD::VECREDUCE_UMIN:
1808 case ISD::VECREDUCE_FMAX:
1809 case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
1810 #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
1811 #include "llvm/IR/VPIntrinsics.def"
1812 return visitVPOp(N);
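// Editor's note: the #include above is an X-macro expansion. Every
// BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) entry in VPIntrinsics.def becomes a
// "case ISD::SDOPC:" label, so all VP opcodes funnel into visitVPOp without
// being listed by hand. Assuming the .def file registers VP_ADD, the
// expansion looks roughly like:
//   case ISD::VP_ADD:
//     return visitVPOp(N);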
1817 SDValue DAGCombiner::combine(SDNode *N) {
1819 if (!DisableGenericCombines)
1822 // If nothing happened, try a target-specific DAG combine.
1823 if (!RV.getNode()) {
1824 assert(N->getOpcode() != ISD::DELETED_NODE &&
1825 "Node was deleted but visit returned NULL!");
1827 if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
1828 TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
1830 // Expose the DAG combiner to the target combiner impls.
1831 TargetLowering::DAGCombinerInfo
1832 DagCombineInfo(DAG, Level, false, this);
1834 RV = TLI.PerformDAGCombine(N, DagCombineInfo);
1838   // If still nothing happened, try promoting the operation.
1839 if (!RV.getNode()) {
1840 switch (N->getOpcode()) {
1848 RV = PromoteIntBinOp(SDValue(N, 0));
1853 RV = PromoteIntShiftOp(SDValue(N, 0));
1855 case ISD::SIGN_EXTEND:
1856 case ISD::ZERO_EXTEND:
1857 case ISD::ANY_EXTEND:
1858 RV = PromoteExtend(SDValue(N, 0));
1861 if (PromoteLoad(SDValue(N, 0)))
1867 // If N is a commutative binary node, try to eliminate it if the commuted
1868 // version is already present in the DAG.
1869 if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode())) {
1870 SDValue N0 = N->getOperand(0);
1871 SDValue N1 = N->getOperand(1);
1873 // Constant operands are canonicalized to RHS.
1874 if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
1875 SDValue Ops[] = {N1, N0};
1876 SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
1879 return SDValue(CSENode, 0);
1886 /// Given a node, return its input chain if it has one, otherwise return a null
1887 /// SDValue.
1888 static SDValue getInputChainForNode(SDNode *N) {
1889 if (unsigned NumOps = N->getNumOperands()) {
1890 if (N->getOperand(0).getValueType() == MVT::Other)
1891 return N->getOperand(0);
1892 if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
1893 return N->getOperand(NumOps-1);
1894 for (unsigned i = 1; i < NumOps-1; ++i)
1895 if (N->getOperand(i).getValueType() == MVT::Other)
1896 return N->getOperand(i);
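// Editor's note: chain values have type MVT::Other. Most chained nodes
// (e.g. loads) carry the chain as operand 0, some carry it last, and the
// loop above covers the remaining positions, so the probes are presumably
// ordered from the most common location to the least.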
1901 SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
1902 // If N has two operands, where one has an input chain equal to the other,
1903 // the 'other' chain is redundant.
1904 if (N->getNumOperands() == 2) {
1905 if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
1906 return N->getOperand(0);
1907 if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
1908 return N->getOperand(1);
1911 // Don't simplify token factors if optnone.
1912 if (OptLevel == CodeGenOpt::None)
1915 // Don't simplify the token factor if the node itself has too many operands.
1916 if (N->getNumOperands() > TokenFactorInlineLimit)
1919 // If the sole user is a token factor, we should make sure we have a
1920 // chance to merge them together. This prevents TF chains from inhibiting
1922 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
1923 AddToWorklist(*(N->use_begin()));
1925 SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
1926 SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
1927 SmallPtrSet<SDNode*, 16> SeenOps;
1928 bool Changed = false; // If we should replace this token factor.
1930 // Start out with this token factor.
1933   // Iterate through token factors. The TFs list grows when new token factors
1934   // are encountered.
1935 for (unsigned i = 0; i < TFs.size(); ++i) {
1936 // Limit number of nodes to inline, to avoid quadratic compile times.
1937 // We have to add the outstanding Token Factors to Ops, otherwise we might
1938 // drop Ops from the resulting Token Factors.
1939 if (Ops.size() > TokenFactorInlineLimit) {
1940 for (unsigned j = i; j < TFs.size(); j++)
1941 Ops.emplace_back(TFs[j], 0);
1942 // Drop unprocessed Token Factors from TFs, so we do not add them to the
1943 // combiner worklist later.
1948 SDNode *TF = TFs[i];
1949 // Check each of the operands.
1950 for (const SDValue &Op : TF->op_values()) {
1951 switch (Op.getOpcode()) {
1952 case ISD::EntryToken:
1953         // Entry tokens don't need to be added to the list. They are
1954         // redundant.
1958 case ISD::TokenFactor:
1959 if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
1960 // Queue up for processing.
1961 TFs.push_back(Op.getNode());
1968 // Only add if it isn't already in the list.
1969 if (SeenOps.insert(Op.getNode()).second)
1978 // Re-visit inlined Token Factors, to clean them up in case they have been
1979 // removed. Skip the first Token Factor, as this is the current node.
1980 for (unsigned i = 1, e = TFs.size(); i < e; i++)
1981 AddToWorklist(TFs[i]);
1983 // Remove Nodes that are chained to another node in the list. Do so
1984   // by walking up chains breadth-first, stopping when we've seen
1985 // another operand. In general we must climb to the EntryNode, but we can exit
1986 // early if we find all remaining work is associated with just one operand as
1987 // no further pruning is possible.
1989 // List of nodes to search through and original Ops from which they originate.
1990 SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
1991 SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
1992 SmallPtrSet<SDNode *, 16> SeenChains;
1993 bool DidPruneOps = false;
1995 unsigned NumLeftToConsider = 0;
1996 for (const SDValue &Op : Ops) {
1997 Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
1998 OpWorkCount.push_back(1);
2001 auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
2002     // If this is an Op, we can remove the op from the list. Re-mark any
2003     // search associated with it as from the current OpNumber.
2004 if (SeenOps.contains(Op)) {
2007 unsigned OrigOpNumber = 0;
2008 while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
2010 assert((OrigOpNumber != Ops.size()) &&
2011 "expected to find TokenFactor Operand");
2012 // Re-mark worklist from OrigOpNumber to OpNumber
2013 for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
2014 if (Worklist[i].second == OrigOpNumber) {
2015 Worklist[i].second = OpNumber;
2018 OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
2019 OpWorkCount[OrigOpNumber] = 0;
2020 NumLeftToConsider--;
2022 // Add if it's a new chain
2023 if (SeenChains.insert(Op).second) {
2024 OpWorkCount[OpNumber]++;
2025 Worklist.push_back(std::make_pair(Op, OpNumber));
2029 for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
2030     // We need to consider at least 2 Ops to prune.
2031 if (NumLeftToConsider <= 1)
2033 auto CurNode = Worklist[i].first;
2034 auto CurOpNumber = Worklist[i].second;
2035 assert((OpWorkCount[CurOpNumber] > 0) &&
2036 "Node should not appear in worklist");
2037 switch (CurNode->getOpcode()) {
2038 case ISD::EntryToken:
2039       // Hitting EntryToken is the only way for the search to terminate without
2041       // hitting another operand's search. Prevent us from marking this operand
2042       // considered.
2043       NumLeftToConsider++;
2045 case ISD::TokenFactor:
2046 for (const SDValue &Op : CurNode->op_values())
2047 AddToWorklist(i, Op.getNode(), CurOpNumber);
2049 case ISD::LIFETIME_START:
2050 case ISD::LIFETIME_END:
2051 case ISD::CopyFromReg:
2052 case ISD::CopyToReg:
2053 AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
2056 if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
2057 AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
2060 OpWorkCount[CurOpNumber]--;
2061 if (OpWorkCount[CurOpNumber] == 0)
2062 NumLeftToConsider--;
2065 // If we've changed things around then replace token factor.
2069 // The entry token is the only possible outcome.
2070 Result = DAG.getEntryNode();
2073 SmallVector<SDValue, 8> PrunedOps;
2075 for (const SDValue &Op : Ops) {
2076 if (SeenChains.count(Op.getNode()) == 0)
2077 PrunedOps.push_back(Op);
2079 Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
2081 Result = DAG.getTokenFactor(SDLoc(N), Ops);
2089 /// MERGE_VALUES can always be eliminated.
2090 SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
2091 WorklistRemover DeadNodes(*this);
2092 // Replacing results may cause a different MERGE_VALUES to suddenly
2093 // be CSE'd with N, and carry its uses with it. Iterate until no
2094 // uses remain, to ensure that the node can be safely deleted.
2095 // First add the users of this node to the work list so that they
2096 // can be tried again once they have new operands.
2097 AddUsersToWorklist(N);
2099 // Do as a single replacement to avoid rewalking use lists.
2100 SmallVector<SDValue, 8> Ops;
2101 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
2102 Ops.push_back(N->getOperand(i));
2103 DAG.ReplaceAllUsesWith(N, Ops.data());
2104 } while (!N->use_empty());
2105 deleteAndRecombine(N);
2106 return SDValue(N, 0); // Return N so it doesn't get rechecked!
2109 /// If \p N is a ConstantSDNode with isOpaque() == false, return it cast to a
2110 /// ConstantSDNode pointer; else nullptr.
2111 static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
2112 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
2113 return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
2116 /// Return true if 'Use' is a load or a store that uses N as its base pointer
2117 /// and that N may be folded in the load / store addressing mode.
2118 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
2119 const TargetLowering &TLI) {
2123 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
2124 if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
2126 VT = LD->getMemoryVT();
2127 AS = LD->getAddressSpace();
2128 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
2129 if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
2131 VT = ST->getMemoryVT();
2132 AS = ST->getAddressSpace();
2133 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
2134 if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
2136 VT = LD->getMemoryVT();
2137 AS = LD->getAddressSpace();
2138 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
2139 if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
2141 VT = ST->getMemoryVT();
2142 AS = ST->getAddressSpace();
2147 TargetLowering::AddrMode AM;
2148 if (N->getOpcode() == ISD::ADD) {
2149 AM.HasBaseReg = true;
2150 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
2153 AM.BaseOffs = Offset->getSExtValue();
2157 } else if (N->getOpcode() == ISD::SUB) {
2158 AM.HasBaseReg = true;
2159 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
2162 AM.BaseOffs = -Offset->getSExtValue();
2170 return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
2171 VT.getTypeForEVT(*DAG.getContext()), AS);
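// Illustrative example (editor's note): on a target with reg+imm addressing,
// N = (add %base, 8) used as the pointer of a load yields AM.HasBaseReg ==
// true and AM.BaseOffs == 8; if isLegalAddressingMode accepts that, the add
// can fold into the load as [%base + 8] and need not survive as a separate
// node.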
2174 /// This inverts a canonicalization in IR that replaces a variable select arm
2175 /// with an identity constant. Codegen improves if we re-use the variable
2176 /// operand rather than load a constant. This can also be converted into a
2177 /// masked vector operation if the target supports it.
2178 static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
2179 bool ShouldCommuteOperands) {
2180 // Match a select as operand 1. The identity constant that we are looking for
2181 // is only valid as operand 1 of a non-commutative binop.
2182 SDValue N0 = N->getOperand(0);
2183 SDValue N1 = N->getOperand(1);
2184 if (ShouldCommuteOperands)
2187 // TODO: Should this apply to scalar select too?
2188 if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
2191 unsigned Opcode = N->getOpcode();
2192 EVT VT = N->getValueType(0);
2193 SDValue Cond = N1.getOperand(0);
2194 SDValue TVal = N1.getOperand(1);
2195 SDValue FVal = N1.getOperand(2);
2197 // TODO: The cases should match with IR's ConstantExpr::getBinOpIdentity().
2198 // TODO: Target-specific opcodes could be added. Ex: "isCommutativeBinOp()".
2199 // TODO: With fast-math (NSZ), allow the opposite-sign form of zero?
2200 auto isIdentityConstantForOpcode = [](unsigned Opcode, SDValue V) {
2201 if (ConstantFPSDNode *C = isConstOrConstSplatFP(V)) {
2203 case ISD::FADD: // X + -0.0 --> X
2204 return C->isZero() && C->isNegative();
2205 case ISD::FSUB: // X - 0.0 --> X
2206 return C->isZero() && !C->isNegative();
2207 case ISD::FMUL: // X * 1.0 --> X
2208 case ISD::FDIV: // X / 1.0 --> X
2209 return C->isExactlyValue(1.0);
2212 if (ConstantSDNode *C = isConstOrConstSplat(V)) {
2214 case ISD::ADD: // X + 0 --> X
2215 case ISD::SUB: // X - 0 --> X
2216 case ISD::SHL: // X << 0 --> X
2217 case ISD::SRA: // X s>> 0 --> X
2218 case ISD::SRL: // X u>> 0 --> X
2220 case ISD::MUL: // X * 1 --> X
2227 // This transform increases uses of N0, so freeze it to be safe.
2228 // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal)
2229 if (isIdentityConstantForOpcode(Opcode, TVal)) {
2230 SDValue F0 = DAG.getFreeze(N0);
2231 SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags());
2232 return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO);
2234 // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0
2235 if (isIdentityConstantForOpcode(Opcode, FVal)) {
2236 SDValue F0 = DAG.getFreeze(N0);
2237 SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags());
2238 return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0);
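// Editor's note on the freezes above: N0 now appears twice, as a binop
// operand and as a select arm. If N0 could be poison or undef, the two uses
// might otherwise observe different values, so both are routed through the
// same frozen node to keep the transform sound.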
2244 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
2245 assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
2246 "Unexpected binary operator");
2248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2249 auto BinOpcode = BO->getOpcode();
2250 EVT VT = BO->getValueType(0);
2251 if (TLI.shouldFoldSelectWithIdentityConstant(BinOpcode, VT)) {
2252 if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false))
2255 if (TLI.isCommutativeBinOp(BO->getOpcode()))
2256 if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true))
2260 // Don't do this unless the old select is going away. We want to eliminate the
2261 // binary operator, not replace a binop with a select.
2262 // TODO: Handle ISD::SELECT_CC.
2263 unsigned SelOpNo = 0;
2264 SDValue Sel = BO->getOperand(0);
2265 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
2267 Sel = BO->getOperand(1);
2270 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
2273 SDValue CT = Sel.getOperand(1);
2274 if (!isConstantOrConstantVector(CT, true) &&
2275 !DAG.isConstantFPBuildVectorOrConstantFP(CT))
2278 SDValue CF = Sel.getOperand(2);
2279 if (!isConstantOrConstantVector(CF, true) &&
2280 !DAG.isConstantFPBuildVectorOrConstantFP(CF))
2283 // Bail out if any constants are opaque because we can't constant fold those.
2284   // The exception is "and" and "or" with either 0 or -1, in which case we can
2285   // propagate non-constant operands into the select. I.e.:
2286 // and (select Cond, 0, -1), X --> select Cond, 0, X
2287 // or X, (select Cond, -1, 0) --> select Cond, -1, X
2288 bool CanFoldNonConst =
2289 (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
2290 (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
2291 (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
2293 SDValue CBO = BO->getOperand(SelOpNo ^ 1);
2294 if (!CanFoldNonConst &&
2295 !isConstantOrConstantVector(CBO, true) &&
2296 !DAG.isConstantFPBuildVectorOrConstantFP(CBO))
2299 // We have a select-of-constants followed by a binary operator with a
2300 // constant. Eliminate the binop by pulling the constant math into the select.
2301 // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
2303 SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
2304 : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
2305 if (!CanFoldNonConst && !NewCT.isUndef() &&
2306 !isConstantOrConstantVector(NewCT, true) &&
2307 !DAG.isConstantFPBuildVectorOrConstantFP(NewCT))
2310 SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
2311 : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
2312 if (!CanFoldNonConst && !NewCF.isUndef() &&
2313 !isConstantOrConstantVector(NewCF, true) &&
2314 !DAG.isConstantFPBuildVectorOrConstantFP(NewCF))
2317 SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
2318 SelectOp->setFlags(BO->getFlags());
2322 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
2323 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
2324 "Expecting add or sub");
2326 // Match a constant operand and a zext operand for the math instruction:
2329 bool IsAdd = N->getOpcode() == ISD::ADD;
2330 SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
2331 SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
2332 auto *CN = dyn_cast<ConstantSDNode>(C);
2333 if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
2336 // Match the zext operand as a setcc of a boolean.
2337 if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
2338 Z.getOperand(0).getValueType() != MVT::i1)
2341 // Match the compare as: setcc (X & 1), 0, eq.
2342 SDValue SetCC = Z.getOperand(0);
2343 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
2344 if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
2345 SetCC.getOperand(0).getOpcode() != ISD::AND ||
2346 !isOneConstant(SetCC.getOperand(0).getOperand(1)))
2349 // We are adding/subtracting a constant and an inverted low bit. Turn that
2350 // into a subtract/add of the low bit with incremented/decremented constant:
2351 // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
2352 // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
2353 EVT VT = C.getValueType();
2355 SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
2356 SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
2357 DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
2358 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
2361 /// Try to fold a 'not' of a shifted sign-bit, used by an add/sub with a
2362 /// constant operand, into a shift and add with a different constant.
2363 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
2364 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
2365 "Expecting add or sub");
2367 // We need a constant operand for the add/sub, and the other operand is a
2368 // logical shift right: add (srl), C or sub C, (srl).
2369 bool IsAdd = N->getOpcode() == ISD::ADD;
2370 SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
2371 SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
2372 if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
2373 ShiftOp.getOpcode() != ISD::SRL)
2376 // The shift must be of a 'not' value.
2377 SDValue Not = ShiftOp.getOperand(0);
2378 if (!Not.hasOneUse() || !isBitwiseNot(Not))
2381 // The shift must be moving the sign bit to the least-significant-bit.
2382 EVT VT = ShiftOp.getValueType();
2383 SDValue ShAmt = ShiftOp.getOperand(1);
2384 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
2385 if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
2388 // Eliminate the 'not' by adjusting the shift and add/sub constant:
2389 // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
2390 // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
2392 if (SDValue NewC = DAG.FoldConstantArithmetic(
2393 IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
2394 {ConstantOp, DAG.getConstant(1, DL, VT)})) {
2395 SDValue NewShift = DAG.getNode(IsAdd ? ISD::SRA : ISD::SRL, DL, VT,
2396 Not.getOperand(0), ShAmt);
2397 return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
2403 static bool isADDLike(SDValue V, const SelectionDAG &DAG) {
2404 unsigned Opcode = V.getOpcode();
2405 if (Opcode == ISD::OR)
2406 return DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1));
2407 if (Opcode == ISD::XOR)
2408 return isMinSignedConstant(V.getOperand(1));
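// Editor's worked example: OR behaves like ADD when the operands share no
// set bits, since no bit position can produce a carry, e.g.
// (or 0xF0, 0x0F) == (add 0xF0, 0x0F) == 0xFF. XOR with the minimum signed
// constant flips only the sign bit, and a carry out of the top bit is
// discarded anyway, so for i8 (xor x, 0x80) == (add x, 0x80).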
2412 /// Try to fold a node that behaves like an ADD (note that N isn't necessarily
2413 /// an ISD::ADD here, it could for example be an ISD::OR if we know that there
2414 /// are no common bits set in the operands).
2415 SDValue DAGCombiner::visitADDLike(SDNode *N) {
2416 SDValue N0 = N->getOperand(0);
2417 SDValue N1 = N->getOperand(1);
2418 EVT VT = N0.getValueType();
2421 // fold (add x, undef) -> undef
2427 // fold (add c1, c2) -> c1+c2
2428 if (SDValue C = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}))
2431 // canonicalize constant to RHS
2432 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2433 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2434 return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
2437 if (VT.isVector()) {
2438 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
2441 // fold (add x, 0) -> x, vector edition
2442 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
2446 // fold (add x, 0) -> x
2447 if (isNullConstant(N1))
2450 if (N0.getOpcode() == ISD::SUB) {
2451 SDValue N00 = N0.getOperand(0);
2452 SDValue N01 = N0.getOperand(1);
2454 // fold ((A-c1)+c2) -> (A+(c2-c1))
2455 if (SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N01}))
2456 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
2458 // fold ((c1-A)+c2) -> (c1+c2)-A
2459 if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N00}))
2460 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
2463 // add (sext i1 X), 1 -> zext (not i1 X)
2464 // We don't transform this pattern:
2465 // add (zext i1 X), -1 -> sext (not i1 X)
2466 // because most (?) targets generate better code for the zext form.
2467 if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
2468 isOneOrOneSplat(N1)) {
2469 SDValue X = N0.getOperand(0);
2470 if ((!LegalOperations ||
2471 (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
2472 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
2473 X.getScalarValueSizeInBits() == 1) {
2474 SDValue Not = DAG.getNOT(DL, X, X.getValueType());
2475 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
2479 // Fold (add (or x, c0), c1) -> (add x, (c0 + c1))
2480 // iff (or x, c0) is equivalent to (add x, c0).
2481 // Fold (add (xor x, c0), c1) -> (add x, (c0 + c1))
2482 // iff (xor x, c0) is equivalent to (add x, c0).
2483 if (isADDLike(N0, DAG)) {
2484 SDValue N01 = N0.getOperand(1);
2485 if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N01}))
2486 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add);
2489 if (SDValue NewSel = foldBinOpIntoSelect(N))
2493 if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) {
2494 if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
2497     // Reassociate (add (or x, c), y) -> (add (add x, y), c) if (or x, c) is
2498     // equivalent to (add x, c).
2499     // Reassociate (add (xor x, c), y) -> (add (add x, y), c) if (xor x, c) is
2500     // equivalent to (add x, c).
2501 auto ReassociateAddOr = [&](SDValue N0, SDValue N1) {
2502 if (isADDLike(N0, DAG) && N0.hasOneUse() &&
2503 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
2504 return DAG.getNode(ISD::ADD, DL, VT,
2505 DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
2510 if (SDValue Add = ReassociateAddOr(N0, N1))
2512 if (SDValue Add = ReassociateAddOr(N1, N0))
2515 // fold ((0-A) + B) -> B-A
2516 if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
2517 return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
2519 // fold (A + (0-B)) -> A-B
2520 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
2521 return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
2523 // fold (A+(B-A)) -> B
2524 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
2525 return N1.getOperand(0);
2527 // fold ((B-A)+A) -> B
2528 if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
2529 return N0.getOperand(0);
2531 // fold ((A-B)+(C-A)) -> (C-B)
2532 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2533 N0.getOperand(0) == N1.getOperand(1))
2534 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2537 // fold ((A-B)+(B-C)) -> (A-C)
2538 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2539 N0.getOperand(1) == N1.getOperand(0))
2540 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
2543 // fold (A+(B-(A+C))) to (B-C)
2544 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
2545 N0 == N1.getOperand(1).getOperand(0))
2546 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2547 N1.getOperand(1).getOperand(1));
2549 // fold (A+(B-(C+A))) to (B-C)
2550 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
2551 N0 == N1.getOperand(1).getOperand(1))
2552 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2553 N1.getOperand(1).getOperand(0));
2555 // fold (A+((B-A)+or-C)) to (B+or-C)
2556 if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
2557 N1.getOperand(0).getOpcode() == ISD::SUB &&
2558 N0 == N1.getOperand(0).getOperand(1))
2559 return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
2562 // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
2563 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2564 N0->hasOneUse() && N1->hasOneUse()) {
2565 SDValue N00 = N0.getOperand(0);
2566 SDValue N01 = N0.getOperand(1);
2567 SDValue N10 = N1.getOperand(0);
2568 SDValue N11 = N1.getOperand(1);
2570 if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
2571 return DAG.getNode(ISD::SUB, DL, VT,
2572 DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
2573 DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
2576 // fold (add (umax X, C), -C) --> (usubsat X, C)
2577 if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
2578 auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
2579 return (!Max && !Op) ||
2580 (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
2582 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
2583 /*AllowUndefs*/ true))
2584 return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
2588 if (SimplifyDemandedBits(SDValue(N, 0)))
2589 return SDValue(N, 0);
2591 if (isOneOrOneSplat(N1)) {
2592 // fold (add (xor a, -1), 1) -> (sub 0, a)
2593 if (isBitwiseNot(N0))
2594 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
2597 // fold (add (add (xor a, -1), b), 1) -> (sub b, a)
2598 if (N0.getOpcode() == ISD::ADD) {
2601 if (isBitwiseNot(N0.getOperand(0))) {
2602 A = N0.getOperand(1);
2603 Xor = N0.getOperand(0);
2604 } else if (isBitwiseNot(N0.getOperand(1))) {
2605 A = N0.getOperand(0);
2606 Xor = N0.getOperand(1);
2610 return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
2614 // add (add x, y), 1
2615 // And if the target does not like this form then turn into:
2616 // sub y, (xor x, -1)
2617 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
2619 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
2620 DAG.getAllOnesConstant(DL, VT));
2621 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
2625 // (x - y) + -1 -> add (xor y, -1), x
2626 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
2627 isAllOnesOrAllOnesSplat(N1)) {
2628 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
2629 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
2632 if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
2635 if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
2641 SDValue DAGCombiner::visitADD(SDNode *N) {
2642 SDValue N0 = N->getOperand(0);
2643 SDValue N1 = N->getOperand(1);
2644 EVT VT = N0.getValueType();
2647 if (SDValue Combined = visitADDLike(N))
2650 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
2653 if (SDValue V = foldAddSubOfSignBit(N, DAG))
2656 // fold (a+b) -> (a|b) iff a and b share no bits.
2657 if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
2658 DAG.haveNoCommonBitsSet(N0, N1))
2659 return DAG.getNode(ISD::OR, DL, VT, N0, N1);
2661 // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
2662 if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
2663 const APInt &C0 = N0->getConstantOperandAPInt(0);
2664 const APInt &C1 = N1->getConstantOperandAPInt(0);
2665 return DAG.getVScale(DL, VT, C0 + C1);
2668 // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
2669 if ((N0.getOpcode() == ISD::ADD) &&
2670 (N0.getOperand(1).getOpcode() == ISD::VSCALE) &&
2671 (N1.getOpcode() == ISD::VSCALE)) {
2672 const APInt &VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
2673 const APInt &VS1 = N1->getConstantOperandAPInt(0);
2674 SDValue VS = DAG.getVScale(DL, VT, VS0 + VS1);
2675 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
2678 // Fold (add step_vector(c1), step_vector(c2) to step_vector(c1+c2))
2679 if (N0.getOpcode() == ISD::STEP_VECTOR &&
2680 N1.getOpcode() == ISD::STEP_VECTOR) {
2681 const APInt &C0 = N0->getConstantOperandAPInt(0);
2682 const APInt &C1 = N1->getConstantOperandAPInt(0);
2683 APInt NewStep = C0 + C1;
2684 return DAG.getStepVector(DL, VT, NewStep);
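// Editor's worked example for 4 lanes: step_vector(1) is <0,1,2,3> and
// step_vector(2) is <0,2,4,6>; their sum <0,3,6,9> is step_vector(3), since
// lane i holds i*c1 + i*c2 == i*(c1+c2).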
2687 // Fold a + step_vector(c1) + step_vector(c2) to a + step_vector(c1+c2)
2688 if ((N0.getOpcode() == ISD::ADD) &&
2689 (N0.getOperand(1).getOpcode() == ISD::STEP_VECTOR) &&
2690 (N1.getOpcode() == ISD::STEP_VECTOR)) {
2691 const APInt &SV0 = N0.getOperand(1)->getConstantOperandAPInt(0);
2692 const APInt &SV1 = N1->getConstantOperandAPInt(0);
2693 APInt NewStep = SV0 + SV1;
2694 SDValue SV = DAG.getStepVector(DL, VT, NewStep);
2695 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV);
2701 SDValue DAGCombiner::visitADDSAT(SDNode *N) {
2702 unsigned Opcode = N->getOpcode();
2703 SDValue N0 = N->getOperand(0);
2704 SDValue N1 = N->getOperand(1);
2705 EVT VT = N0.getValueType();
2708 // fold (add_sat x, undef) -> -1
2709 if (N0.isUndef() || N1.isUndef())
2710 return DAG.getAllOnesConstant(DL, VT);
2712 // fold (add_sat c1, c2) -> c3
2713 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
2716 // canonicalize constant to RHS
2717 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2718 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2719 return DAG.getNode(Opcode, DL, VT, N1, N0);
2722 if (VT.isVector()) {
2723 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
2726 // fold (add_sat x, 0) -> x, vector edition
2727 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
2731 // fold (add_sat x, 0) -> x
2732 if (isNullConstant(N1))
2735 // If it cannot overflow, transform into an add.
2736 if (Opcode == ISD::UADDSAT)
2737 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2738 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
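// Editor's worked example: for i8, uaddsat (and x, 15), 16 is at most
// 15 + 16 == 31 < 255, so known bits let computeOverflowKind report
// OFK_Never and the saturating add becomes a plain add.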
2743 static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
2744 bool Masked = false;
2746 // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
2748 if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
2749 V = V.getOperand(0);
2753 if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
2755 V = V.getOperand(0);
2762 // If this is not a carry, return.
2763 if (V.getResNo() != 1)
2766 if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
2767 V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
2770 EVT VT = V->getValueType(0);
2771 if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
2774 // If the result is masked, then no matter what kind of bool it is we can
2775 // return. If it isn't, then we need to make sure the bool type is either 0 or
2776 // 1 and not other values.
2778 TLI.getBooleanContents(V.getValueType()) ==
2779 TargetLoweringBase::ZeroOrOneBooleanContent)
2785 /// Given the operands of an add/sub operation, see if the 2nd operand is a
2786 /// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
2787 /// the opcode and bypass the mask operation.
2788 static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
2789 SelectionDAG &DAG, const SDLoc &DL) {
2790 if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
2793 EVT VT = N0.getValueType();
2794 if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
2797 // add N0, (and (AssertSext X, i1), 1) --> sub N0, X
2798 // sub N0, (and (AssertSext X, i1), 1) --> add N0, X
2799 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
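// Editor's worked example: X is known to be 0 or -1 (all sign bits, per the
// ComputeNumSignBits check above), so (and X, 1) is 0 or 1 respectively, and
//   add N0, (and X, 1) == sub N0, X   (adding 0 == subtracting 0;
//                                      adding 1 == subtracting -1)
// which is why the opcode is inverted and the mask bypassed.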
2802 /// Helper for doing combines based on N0 and N1 being added to each other.
2803 SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
2804 SDNode *LocReference) {
2805 EVT VT = N0.getValueType();
2806 SDLoc DL(LocReference);
2808 // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
2809 if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
2810 isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
2811 return DAG.getNode(ISD::SUB, DL, VT, N0,
2812 DAG.getNode(ISD::SHL, DL, VT,
2813 N1.getOperand(0).getOperand(1),
2816 if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
2820 // add (add x, 1), y
2821 // And if the target does not like this form then turn into:
2822 // sub y, (xor x, -1)
2823 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
2824 N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1))) {
2825 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
2826 DAG.getAllOnesConstant(DL, VT));
2827 return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
2830 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse()) {
2831 // Hoist one-use subtraction by non-opaque constant:
2832 // (x - C) + y -> (x + y) - C
2833 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
2834 if (isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
2835 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
2836 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
2838 // Hoist one-use subtraction from non-opaque constant:
2839 // (C - x) + y -> (y - x) + C
2840 if (isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
2841 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
2842 return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
2846 // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
2847 // rather than 'add 0/-1' (the zext should get folded).
2848 // add (sext i1 Y), X --> sub X, (zext i1 Y)
2849 if (N0.getOpcode() == ISD::SIGN_EXTEND &&
2850 N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
2851 TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
2852 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
2853 return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
2856 // add X, (sextinreg Y i1) -> sub X, (and Y 1)
2857 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
2858 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
2859 if (TN->getVT() == MVT::i1) {
2860 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
2861 DAG.getConstant(1, DL, VT));
2862 return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
2866 // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
2867 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
2869 return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
2870 N0, N1.getOperand(0), N1.getOperand(2));
2872 // (add X, Carry) -> (addcarry X, 0, Carry)
2873 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
2874 if (SDValue Carry = getAsCarry(TLI, N1))
2875 return DAG.getNode(ISD::ADDCARRY, DL,
2876 DAG.getVTList(VT, Carry.getValueType()), N0,
2877 DAG.getConstant(0, DL, VT), Carry);
2882 SDValue DAGCombiner::visitADDC(SDNode *N) {
2883 SDValue N0 = N->getOperand(0);
2884 SDValue N1 = N->getOperand(1);
2885 EVT VT = N0.getValueType();
2888 // If the flag result is dead, turn this into an ADD.
2889 if (!N->hasAnyUseOfValue(1))
2890 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2891 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
2893 // canonicalize constant to RHS.
2894 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
2895 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
2897 return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
2899 // fold (addc x, 0) -> x + no carry out
2900 if (isNullConstant(N1))
2901 return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
2904 // If it cannot overflow, transform into an add.
2905 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2906 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2907 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
2913  * Flips a boolean if it is cheaper to compute. If the Force parameter is set,
2914 * then the flip also occurs if computing the inverse is the same cost.
2915 * This function returns an empty SDValue in case it cannot flip the boolean
2916 * without increasing the cost of the computation. If you want to flip a boolean
2917 * no matter what, use DAG.getLogicalNOT.
2919 static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
2920 const TargetLowering &TLI,
2922 if (Force && isa<ConstantSDNode>(V))
2923 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
2925 if (V.getOpcode() != ISD::XOR)
2928 ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
2932 EVT VT = V.getValueType();
2934 bool IsFlip = false;
2935 switch(TLI.getBooleanContents(VT)) {
2936 case TargetLowering::ZeroOrOneBooleanContent:
2937 IsFlip = Const->isOne();
2939 case TargetLowering::ZeroOrNegativeOneBooleanContent:
2940 IsFlip = Const->isAllOnes();
2942 case TargetLowering::UndefinedBooleanContent:
2943 IsFlip = (Const->getAPIntValue() & 0x01) == 1;
2948 return V.getOperand(0);
2950 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
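// Editor's example: with ZeroOrOneBooleanContent, (xor b, 1) is the logical
// negation of b, so the flipped boolean is simply b, i.e. V's first operand;
// with ZeroOrNegativeOneBooleanContent the flipping constant is -1, and with
// undefined contents only bit 0 of the constant matters.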
2954 SDValue DAGCombiner::visitADDO(SDNode *N) {
2955 SDValue N0 = N->getOperand(0);
2956 SDValue N1 = N->getOperand(1);
2957 EVT VT = N0.getValueType();
2958 bool IsSigned = (ISD::SADDO == N->getOpcode());
2960 EVT CarryVT = N->getValueType(1);
2963 // If the flag result is dead, turn this into an ADD.
2964 if (!N->hasAnyUseOfValue(1))
2965 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2966 DAG.getUNDEF(CarryVT));
2968 // canonicalize constant to RHS.
2969 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2970 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2971 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
2973 // fold (addo x, 0) -> x + no carry out
2974 if (isNullOrNullSplat(N1))
2975 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
2978 // If it cannot overflow, transform into an add.
2979 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2980 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2981 DAG.getConstant(0, DL, CarryVT));
2983 // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
2984 if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
2985 SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
2986 DAG.getConstant(0, DL, VT), N0.getOperand(0));
2988 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
2991 if (SDValue Combined = visitUADDOLike(N0, N1, N))
2994 if (SDValue Combined = visitUADDOLike(N1, N0, N))
3001 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
3002 EVT VT = N0.getValueType();
3006 // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
3007 // If Y + 1 cannot overflow.
3008 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
3009 SDValue Y = N1.getOperand(0);
3010 SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
3011 if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
3012 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
3016 // (uaddo X, Carry) -> (addcarry X, 0, Carry)
3017 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
3018 if (SDValue Carry = getAsCarry(TLI, N1))
3019 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
3020 DAG.getConstant(0, SDLoc(N), VT), Carry);
3025 SDValue DAGCombiner::visitADDE(SDNode *N) {
3026 SDValue N0 = N->getOperand(0);
3027 SDValue N1 = N->getOperand(1);
3028 SDValue CarryIn = N->getOperand(2);
3030 // canonicalize constant to RHS
3031 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3032 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3034 return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
3037 // fold (adde x, y, false) -> (addc x, y)
3038 if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
3039 return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
3044 SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
3045 SDValue N0 = N->getOperand(0);
3046 SDValue N1 = N->getOperand(1);
3047 SDValue CarryIn = N->getOperand(2);
3050 // canonicalize constant to RHS
3051 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3052 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3054 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
3056 // fold (addcarry x, y, false) -> (uaddo x, y)
3057 if (isNullConstant(CarryIn)) {
3058 if (!LegalOperations ||
3059 TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
3060 return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
3063 // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
3064 if (isNullConstant(N0) && isNullConstant(N1)) {
3065 EVT VT = N0.getValueType();
3066 EVT CarryVT = CarryIn.getValueType();
3067 SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
3068 AddToWorklist(CarryExt.getNode());
3069 return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
3070 DAG.getConstant(1, DL, VT)),
3071 DAG.getConstant(0, DL, CarryVT));
3074 if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
3077 if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
3083 SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
3084 SDValue N0 = N->getOperand(0);
3085 SDValue N1 = N->getOperand(1);
3086 SDValue CarryIn = N->getOperand(2);
3089 // canonicalize constant to RHS
3090 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3091 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3093 return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
3095 // fold (saddo_carry x, y, false) -> (saddo x, y)
3096 if (isNullConstant(CarryIn)) {
3097 if (!LegalOperations ||
3098 TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
3099 return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
3106  * If we are facing some sort of diamond carry propagation pattern, try to
3107  * break it up to generate something like:
3108 * (addcarry X, 0, (addcarry A, B, Z):Carry)
3110  * The end result is usually an increase in the number of operations required,
3111  * but because the carry is now linearized, other transforms can kick in and optimize the DAG.
3113 * Patterns typically look something like
3118 * | (addcarry *, 0, Z)
3122 * (addcarry X, *, *)
3124  * But numerous variations exist. Our goal is to identify A, B, X and Z and
3125 * produce a combine with a single path for carry propagation.
3127 static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
3128 SDValue X, SDValue Carry0, SDValue Carry1,
3130 if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
3132 if (Carry1.getOpcode() != ISD::UADDO)
3138 * First look for a suitable Z. It will present itself in the form of
3139 * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
3141 if (Carry0.getOpcode() == ISD::ADDCARRY &&
3142 isNullConstant(Carry0.getOperand(1))) {
3143 Z = Carry0.getOperand(2);
3144 } else if (Carry0.getOpcode() == ISD::UADDO &&
3145 isOneConstant(Carry0.getOperand(1))) {
3146 EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
3147 Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
3149 // We couldn't find a suitable Z.
3154   auto cancelDiamond = [&](SDValue A, SDValue B) {
3156 SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
3157 Combiner.AddToWorklist(NewY.getNode());
3158 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
3159 DAG.getConstant(0, DL, X.getValueType()),
3168 * (addcarry *, 0, Z)
3170 if (Carry0.getOperand(0) == Carry1.getValue(0)) {
3171 return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
3175 * (addcarry A, 0, Z)
3181 if (Carry1.getOperand(0) == Carry0.getValue(0)) {
3182 return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
3185 if (Carry1.getOperand(1) == Carry0.getValue(0)) {
3186 return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
3192 // If we are facing some sort of diamond carry/borrow in/out pattern try to
3193 // match patterns like:
3195 // (uaddo A, B) CarryIn
3198 // PartialSum PartialCarryOutX /
3200 // | ____|____________/
3202 // (uaddo *, *) \________
3205 // | PartialCarryOutY |
3208 // AddCarrySum | ______/
3210 // CarryOut = (or *, *)
3212 // And generate ADDCARRY (or SUBCARRY) with two result values:
3214 // {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
3216 // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
3217 // a single path for carry/borrow out propagation:
3218 static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
3219 SDValue N0, SDValue N1, SDNode *N) {
3220 SDValue Carry0 = getAsCarry(TLI, N0);
3223 SDValue Carry1 = getAsCarry(TLI, N1);
3227 unsigned Opcode = Carry0.getOpcode();
3228 if (Opcode != Carry1.getOpcode())
3230 if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
3233 // Canonicalize the add/sub of A and B (the top node in the above ASCII art)
3234 // as Carry0 and the add/sub of the carry in as Carry1 (the middle node).
3235 if (Carry1.getNode()->isOperandOf(Carry0.getNode()))
3236 std::swap(Carry0, Carry1);
3238   // Check if nodes are connected in the expected way.
3239 if (Carry1.getOperand(0) != Carry0.getValue(0) &&
3240 Carry1.getOperand(1) != Carry0.getValue(0))
3243   // The carry in value must be on the right-hand side for subtraction.
3244 unsigned CarryInOperandNum =
3245 Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
3246 if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
3248 SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
3250 unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
3251 if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
3254 // Verify that the carry/borrow in is plausibly a carry/borrow bit.
3255 // TODO: make getAsCarry() aware of how partial carries are merged.
3256 if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
3258 CarryIn = CarryIn.getOperand(0);
3259 if (CarryIn.getValueType() != MVT::i1)
3264 DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
3265 Carry0.getOperand(1), CarryIn);
3267 // Please note that because we have proven that the result of the UADDO/USUBO
3268 // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
3269 // therefore prove that if the first UADDO/USUBO overflows, the second
3270 // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
3273 // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
3274 // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
3276 // This is important because it means that OR and XOR can be used to merge
3277 // carry flags; and that AND can return a constant zero.
3279 // TODO: match other operations that can merge flags (ADD, etc)
3280 DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
3281 if (N->getOpcode() == ISD::AND)
3282 return DAG.getConstant(0, DL, MVT::i1);
3283 return Merged.getValue(1);
3286 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
3288 // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
3289 if (isBitwiseNot(N0))
3290 if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
3292 SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
3293 N0.getOperand(0), NotC);
3295 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
3298 // Iff the flag result is dead:
3299 // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
3300 // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
3301 // or the dependency between the instructions.
3302 if ((N0.getOpcode() == ISD::ADD ||
3303 (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
3304 N0.getValue(1) != CarryIn)) &&
3305 isNullConstant(N1) && !N->hasAnyUseOfValue(1))
3306 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
3307 N0.getOperand(0), N0.getOperand(1), CarryIn);
3310  * When one of the addcarry arguments is itself a carry, we may be facing
3311  * a diamond carry propagation. In that case we try to transform the DAG
3312  * to ensure linear carry propagation if that is possible.
3314 if (auto Y = getAsCarry(TLI, N1)) {
3315 // Because both are carries, Y and Z can be swapped.
3316 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
3318 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
3325 // Attempt to create a USUBSAT(LHS, RHS) node with DstVT, performing a
3326 // clamp/truncation if necessary.
3327 static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
3328 SDValue RHS, SelectionDAG &DAG,
3330 assert(DstVT.getScalarSizeInBits() <= SrcVT.getScalarSizeInBits() &&
3331 "Illegal truncation");
3334 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
3336   // If the LHS is zero-extended then we can perform the USUBSAT as DstVT by
3337   // clamping the RHS and truncating both operands.
3338 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
3339 DstVT.getScalarSizeInBits());
3340 if (!DAG.MaskedValueIsZero(LHS, UpperBits))
3344 DAG.getConstant(APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(),
3345 DstVT.getScalarSizeInBits()),
3347 RHS = DAG.getNode(ISD::UMIN, DL, SrcVT, RHS, SatLimit);
3348 RHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, RHS);
3349 LHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, LHS);
3350 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
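// Editor's worked example: truncating an i16 usubsat to i8 when the LHS is
// known to fit in 8 bits: clamp RHS to umin(RHS, 0xFF) so the truncation
// cannot flip the comparison, and then usubsat.i8(trunc LHS, trunc RHS) ==
// trunc(usubsat.i16(LHS, RHS)).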
3353 // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
3354 // usubsat(a,b), optionally as a truncated type.
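// Editor's note: the identity behind these folds is
//   usubsat(a,b) == umax(a,b) - b == a - umin(a,b)
// since if a >= b every form yields a - b, and otherwise every form yields 0.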
3355 SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
3356 if (N->getOpcode() != ISD::SUB ||
3357 !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
3360 EVT SubVT = N->getValueType(0);
3361 SDValue Op0 = N->getOperand(0);
3362 SDValue Op1 = N->getOperand(1);
3364   // Try to find umax(a,b) - b or a - umin(a,b) patterns;
3365   // they may be converted to usubsat(a,b).
3366 if (Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
3367 SDValue MaxLHS = Op0.getOperand(0);
3368 SDValue MaxRHS = Op0.getOperand(1);
3370 return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N));
3372 return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N));
3375 if (Op1.getOpcode() == ISD::UMIN && Op1.hasOneUse()) {
3376 SDValue MinLHS = Op1.getOperand(0);
3377 SDValue MinRHS = Op1.getOperand(1);
3379 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N));
3381 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N));
3384 // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit)))
3385 if (Op1.getOpcode() == ISD::TRUNCATE &&
3386 Op1.getOperand(0).getOpcode() == ISD::UMIN &&
3387 Op1.getOperand(0).hasOneUse()) {
3388 SDValue MinLHS = Op1.getOperand(0).getOperand(0);
3389 SDValue MinRHS = Op1.getOperand(0).getOperand(1);
3390 if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
3391 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS,
3393 if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0)
3394 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS,

// Since it may not be valid to emit a fold to zero for vector initializers,
// check if we can before folding.
static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
                             SelectionDAG &DAG, bool LegalOperations) {
  if (!VT.isVector())
    return DAG.getConstant(0, DL, VT);
  if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
    return DAG.getConstant(0, DL, VT);
  return SDValue();
}

SDValue DAGCombiner::visitSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  auto PeekThroughFreeze = [](SDValue N) {
    if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
      return N->getOperand(0);
    return N;
  };

  // fold (sub x, x) -> 0
  // FIXME: Refactor this and xor and other similar operations together.
  if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
    return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);

  // fold (sub c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    // fold (sub x, 0) -> x, vector edition
    if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
      return N0;
  }

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

  // fold (sub x, c) -> (add x, -c)
  if (N1C) {
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
  }

  if (isNullOrNullSplat(N0)) {
    unsigned BitWidth = VT.getScalarSizeInBits();
    // Right-shifting everything out but the sign bit followed by negation is
    // the same as flipping arithmetic/logical shift type without the negation:
    // -(X >>u 31) -> (X >>s 31)
    // -(X >>s 31) -> (X >>u 31)
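    // For example, with i8 and X = 0x80: (X >>u 7) = 1 and 0 - 1 = -1, while
    // (X >>s 7) = 0xFF = -1; both shifts also agree for non-negative X, so the
    // negated logical shift and the plain arithmetic shift are interchangeable.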
    if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
      ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
      if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
        auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
        if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
          return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
      }
    }

    // 0 - X --> 0 if the sub is NUW.
    if (N->getFlags().hasNoUnsignedWrap())
      return N0;

    if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
      // N1 is either 0 or the minimum signed value. If the sub is NSW, then
      // N1 must be 0 because negating the minimum signed value is undefined.
      if (N->getFlags().hasNoSignedWrap())
        return N0;

      // 0 - X --> X if X is 0 or the minimum signed value.
      return N1;
    }

    // Convert 0 - abs(x).
    if (N1.getOpcode() == ISD::ABS && N1.hasOneUse() &&
        !TLI.isOperationLegalOrCustom(ISD::ABS, VT))
      if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true))
        return Result;

    // Fold neg(splat(neg(x))) -> splat(x)
    if (VT.isVector()) {
      SDValue N1S = DAG.getSplatValue(N1, true);
      if (N1S && N1S.getOpcode() == ISD::SUB &&
          isNullConstant(N1S.getOperand(0))) {
        if (VT.isScalableVector())
          return DAG.getSplatVector(VT, DL, N1S.getOperand(1));
        return DAG.getSplatBuildVector(VT, DL, N1S.getOperand(1));
      }
    }
  }

  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
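  // e.g. for i8: 0xFF - x never borrows from any bit position, so each result
  // bit is just the complement of the corresponding bit of x, i.e. x ^ 0xFF.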
  if (isAllOnesOrAllOnesSplat(N0))
    return DAG.getNode(ISD::XOR, DL, VT, N1, N0);

  // fold (A - (0-B)) -> A+B
  if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
    return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));

  // fold A-(A-B) -> B
  if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
    return N1.getOperand(1);

  // fold (A+B)-A -> B
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
    return N0.getOperand(1);

  // fold (A+B)-B -> A
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
    return N0.getOperand(0);

  // fold (A+C1)-C2 -> A+(C1-C2)
  if (N0.getOpcode() == ISD::ADD) {
    SDValue N01 = N0.getOperand(1);
    if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N01, N1}))
      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
  }

  // fold C2-(A+C1) -> (C2-C1)-A
  if (N1.getOpcode() == ISD::ADD) {
    SDValue N11 = N1.getOperand(1);
    if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}))
      return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
  }

  // fold (A-C1)-C2 -> A-(C1+C2)
  if (N0.getOpcode() == ISD::SUB) {
    SDValue N01 = N0.getOperand(1);
    if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N01, N1}))
      return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
  }

  // fold (c1-A)-c2 -> (c1-c2)-A
  if (N0.getOpcode() == ISD::SUB) {
    SDValue N00 = N0.getOperand(0);
    if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N00, N1}))
      return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
  }

  // fold ((A+(B+or-C))-B) -> A+or-C
  if (N0.getOpcode() == ISD::ADD &&
      (N0.getOperand(1).getOpcode() == ISD::SUB ||
       N0.getOperand(1).getOpcode() == ISD::ADD) &&
      N0.getOperand(1).getOperand(0) == N1)
    return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(1));

  // fold ((A+(C+B))-B) -> A+C
  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold ((A-(B-C))-C) -> A-B
  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
      N0.getOperand(1).getOperand(1) == N1)
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
                       N0.getOperand(1).getOperand(0));

  // fold (A-(B-C)) -> A+(C-B)
  if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
                                   N1.getOperand(0)));

  // A - (A & B) -> A & (~B)
  if (N1.getOpcode() == ISD::AND) {
    SDValue A = N1.getOperand(0);
    SDValue B = N1.getOperand(1);
    if (A != N0)
      std::swap(A, B);
    if (A == N0 &&
        (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
      SDValue InvB =
          DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
      return DAG.getNode(ISD::AND, DL, VT, A, InvB);
    }
  }

  // fold (X - (-Y * Z)) -> (X + (Y * Z))
  if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
    if (N1.getOperand(0).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0).getOperand(1),
                                N1.getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
    if (N1.getOperand(1).getOpcode() == ISD::SUB &&
        isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
                                N1.getOperand(0),
                                N1.getOperand(1).getOperand(1));
      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
    }
  }

  // If either operand of a sub is undef, the result is undef.
  if (N0.isUndef())
    return N0;
  if (N1.isUndef())
    return N1;

  if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
    return V;

  if (SDValue V = foldAddSubOfSignBit(N, DAG))
    return V;

  if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
    return V;

  if (SDValue V = foldSubToUSubSat(VT, N))
    return V;

  // (x - y) - 1 -> add (xor y, -1), x
  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isOneOrOneSplat(N1)) {
    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
                              DAG.getAllOnesConstant(DL, VT));
    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
  }

  // Look for:
  //   sub y, (xor x, -1)
  // And if the target does not like this form then turn into:
  //   add (add x, y), 1
  if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
  }

  // Hoist one-use addition by non-opaque constant:
  //   (x + C) - y -> (x - y) + C
  if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
  }
  // y - (x + C) -> (y - x) - C
  if (N1.getOpcode() == ISD::ADD && N1.hasOneUse() &&
      isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
  }
  // (x - C) - y -> (x - y) - C
  // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
      isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
    return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
  }
  // (C - x) - y -> C - (x + y)
  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
      isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
  }

  // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
  // rather than 'sub 0/1' (the sext should get folded).
  // sub X, (zext i1 Y) --> add X, (sext i1 Y)
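  // This is sound because (zext i1 Y) is 0 or 1 while (sext i1 Y) is 0 or -1,
  // so X - zext(Y) == X + sext(Y) for both values of Y.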
  if (N1.getOpcode() == ISD::ZERO_EXTEND &&
      N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
      TLI.getBooleanContents(VT) ==
          TargetLowering::ZeroOrNegativeOneBooleanContent) {
    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
  }

  // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
  if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
    if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
      SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
      SDValue S0 = N1.getOperand(0);
      if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
        if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
          if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
            return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
    }
  }

  // If the relocation model supports it, consider symbol offsets.
  if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
    if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
      // fold (sub Sym, c) -> Sym-c
      if (N1C && GA->getOpcode() == ISD::GlobalAddress)
        return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
                                    GA->getOffset() -
                                        (uint64_t)N1C->getSExtValue());
      // fold (sub Sym+c1, Sym+c2) -> c1-c2
      if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
        if (GA->getGlobal() == GB->getGlobal())
          return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
                                 DL, VT);
    }

  // sub X, (sextinreg Y i1) -> add X, (and Y 1)
  if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
    if (TN->getVT() == MVT::i1) {
      SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
                                 DAG.getConstant(1, DL, VT));
      return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
    }
  }

  // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
  if (N1.getOpcode() == ISD::VSCALE) {
    const APInt &IntVal = N1.getConstantOperandAPInt(0);
    return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
  }

  // canonicalize (sub X, step_vector(C)) to (add X, step_vector(-C))
  if (N1.getOpcode() == ISD::STEP_VECTOR && N1.hasOneUse()) {
    APInt NewStep = -N1.getConstantOperandAPInt(0);
    return DAG.getNode(ISD::ADD, DL, VT, N0,
                       DAG.getStepVector(DL, VT, NewStep));
  }

  // Prefer an add for more folding potential and possibly better codegen:
  // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
  if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
    SDValue ShAmt = N1.getOperand(1);
    ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
    if (ShAmtC &&
        ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
      SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
      return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
    }
  }

  // As with the previous fold, prefer add for more folding potential.
  // Subtracting SMIN/0 is the same as adding SMIN/0:
  // N0 - (X << BW-1) --> N0 + (X << BW-1)
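  // For i8, (X << 7) is either 0 or 0x80, and 0x80 is its own negation modulo
  // 256, so subtracting and adding it produce the same value.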
  if (N1.getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1));
    if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1)
      return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
  }

  if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
    // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
    if (SDValue Carry = getAsCarry(TLI, N0)) {
      SDValue X = N1;
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
      return DAG.getNode(ISD::ADDCARRY, DL,
                         DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
                         Carry);
    }
  }

  // If there's no chance of borrowing from adjacent bits, then sub is xor:
  // sub C0, X --> xor X, C0
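  // e.g. C0 = 0b1111 with X known to fit in the low 4 bits: 15 - 5 == 10 ==
  // 15 ^ 5, since no bit position can borrow from the one above it.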
  if (ConstantSDNode *C0 = isConstOrConstSplat(N0)) {
    if (!C0->isOpaque()) {
      const APInt &C0Val = C0->getAPIntValue();
      const APInt &MaybeOnes = ~DAG.computeKnownBits(N1).Zero;
      if ((C0Val - MaybeOnes) == (C0Val ^ MaybeOnes))
        return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold (sub_sat x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat x, x) -> 0
  if (N0 == N1)
    return DAG.getConstant(0, DL, VT);

  // fold (sub_sat c1, c2) -> c3
  if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    // fold (sub_sat x, 0) -> x, vector edition
    if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
      return N0;
  }

  // fold (sub_sat x, 0) -> x
  if (isNullConstant(N1))
    return N0;

  return SDValue();
}

SDValue DAGCombiner::visitSUBC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // If the flag result is dead, turn this into an SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, x) -> 0 + no borrow
  if (N0 == N1)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // fold (subc x, 0) -> x + no borrow
  if (isNullConstant(N1))
    return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
  if (isAllOnesConstant(N0))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));

  return SDValue();
}

SDValue DAGCombiner::visitSUBO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SSUBO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  // If the flag result is dead, turn this into an SUB.
  if (!N->hasAnyUseOfValue(1))
    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
                     DAG.getUNDEF(CarryVT));

  // fold (subo x, x) -> 0 + no borrow
  if (N0 == N1)
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getConstant(0, DL, CarryVT));

  ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);

  // fold (subo x, c) -> (addo x, -c)
  if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
    return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
                       DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
  }

  // fold (subo x, 0) -> x + no borrow
  if (isNullOrNullSplat(N1))
    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));

  // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
  if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
                     DAG.getConstant(0, DL, CarryVT));

  return SDValue();
}

SDValue DAGCombiner::visitSUBE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // fold (sube x, y, false) -> (subc x, y)
  if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
    return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);

  return SDValue();
}

SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // fold (subcarry x, y, false) -> (usubo x, y)
  if (isNullConstant(CarryIn)) {
    if (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
      return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
  }

  return SDValue();
}

SDValue DAGCombiner::visitSSUBO_CARRY(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue CarryIn = N->getOperand(2);

  // fold (ssubo_carry x, y, false) -> (ssubo x, y)
  if (isNullConstant(CarryIn)) {
    if (!LegalOperations ||
        TLI.isOperationLegalOrCustom(ISD::SSUBO, N->getValueType(0)))
      return DAG.getNode(ISD::SSUBO, SDLoc(N), N->getVTList(), N0, N1);
  }

  return SDValue();
}

// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
// UMULFIXSAT here.
SDValue DAGCombiner::visitMULFIX(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Scale = N->getOperand(2);
  EVT VT = N0.getValueType();

  // fold (mulfix x, undef, scale) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  // Canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);

  // fold (mulfix x, 0, scale) -> 0
  if (isNullConstant(N1))
    return DAG.getConstant(0, SDLoc(N), VT);

  return SDValue();
}

SDValue DAGCombiner::visitMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  SDLoc DL(N);

  // fold (mul x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (mul c1, c2) -> c1*c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::MUL, DL, VT, N1, N0);

  bool N1IsConst = false;
  bool N1IsOpaqueConst = false;
  APInt ConstValue1;

  // fold vector ops
  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
    assert((!N1IsConst ||
            ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
           "Splat APInt should be element width");
  } else {
    N1IsConst = isa<ConstantSDNode>(N1);
    if (N1IsConst) {
      ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
      N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
    }
  }

  // fold (mul x, 0) -> 0
  if (N1IsConst && ConstValue1.isZero())
    return N1;

  // fold (mul x, 1) -> x
  if (N1IsConst && ConstValue1.isOne())
    return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (mul x, -1) -> 0-x
  if (N1IsConst && ConstValue1.isAllOnes())
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT), N0);

  // fold (mul x, (1 << c)) -> x << c
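  // e.g. (mul x, 8) becomes (shl x, 3); BuildLogBase2 recovers the exponent
  // from the (possibly splatted) power-of-2 constant.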
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) &&
      (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
  }

  // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
  if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
    unsigned Log2Val = (-ConstValue1).logBase2();
    // FIXME: If the input is something that is easily negated (e.g. a
    // single-use add), we should put the negate there.
    return DAG.getNode(ISD::SUB, DL, VT,
                       DAG.getConstant(0, DL, VT),
                       DAG.getNode(ISD::SHL, DL, VT, N0,
                                   DAG.getConstant(Log2Val, DL,
                                       getShiftAmountTy(N0.getValueType()))));
  }

  // Try to transform:
  // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
  // mul x, (2^N + 1) --> add (shl x, N), x
  // mul x, (2^N - 1) --> sub (shl x, N), x
  // Examples: x * 33 --> (x << 5) + x
  //           x * 15 --> (x << 4) - x
  //           x * -33 --> -((x << 5) + x)
  //           x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
  // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
  // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
  // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
  // Examples: x * 0x8800 --> (x << 15) + (x << 11)
  //           x * 0xf800 --> (x << 16) - (x << 11)
  //           x * -0x8800 --> -((x << 15) + (x << 11))
  //           x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
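  // Walking through MulC = 0x8800: it has 11 trailing zeros; shifting them out
  // leaves 0x11, and 0x11 - 1 = 0x10 is a power of 2, so MathOp = ADD with
  // shift amounts 4 + 11 = 15 and 11, matching the example above.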
  if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
    // TODO: We could handle more general decomposition of any constant by
    //       having the target set a limit on number of ops and making a
    //       callback to determine that sequence (similar to sqrt expansion).
    unsigned MathOp = ISD::DELETED_NODE;
    APInt MulC = ConstValue1.abs();
    // The constant `2` should be treated as (2^0 + 1).
    unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros();
    MulC.lshrInPlace(TZeros);
    if ((MulC - 1).isPowerOf2())
      MathOp = ISD::ADD;
    else if ((MulC + 1).isPowerOf2())
      MathOp = ISD::SUB;

    if (MathOp != ISD::DELETED_NODE) {
      unsigned ShAmt =
          MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
      ShAmt += TZeros;
      assert(ShAmt < VT.getScalarSizeInBits() &&
             "multiply-by-constant generated out of bounds shift");
      SDValue Shl =
          DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
      SDValue R =
          TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
                               DAG.getNode(ISD::SHL, DL, VT, N0,
                                           DAG.getConstant(TZeros, DL, VT)))
                 : DAG.getNode(MathOp, DL, VT, Shl, N0);
      if (ConstValue1.isNegative())
        R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
      return R;
    }
  }

  // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
  if (N0.getOpcode() == ISD::SHL) {
    SDValue N01 = N0.getOperand(1);
    if (SDValue C3 = DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N1, N01}))
      return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), C3);
  }

  // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
  // use.
  {
    SDValue Sh, Y;

    // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
    if (N0.getOpcode() == ISD::SHL &&
        isConstantOrConstantVector(N0.getOperand(1)) && N0->hasOneUse()) {
      Sh = N0; Y = N1;
    } else if (N1.getOpcode() == ISD::SHL &&
               isConstantOrConstantVector(N1.getOperand(1)) &&
               N1->hasOneUse()) {
      Sh = N1; Y = N0;
    }

    if (Sh.getNode()) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, Sh.getOperand(0), Y);
      return DAG.getNode(ISD::SHL, DL, VT, Mul, Sh.getOperand(1));
    }
  }

  // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
      N0.getOpcode() == ISD::ADD &&
      DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
      isMulAddWithConstProfitable(N, N0, N1))
    return DAG.getNode(
        ISD::ADD, DL, VT,
        DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1),
        DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1));

  // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
  if (N0.getOpcode() == ISD::VSCALE)
    if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
      const APInt &C0 = N0.getConstantOperandAPInt(0);
      const APInt &C1 = NC1->getAPIntValue();
      return DAG.getVScale(DL, VT, C0 * C1);
    }

  // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
  APInt MulVal;
  if (N0.getOpcode() == ISD::STEP_VECTOR)
    if (ISD::isConstantSplatVector(N1.getNode(), MulVal)) {
      const APInt &C0 = N0.getConstantOperandAPInt(0);
      APInt NewStep = C0 * MulVal;
      return DAG.getStepVector(DL, VT, NewStep);
    }

  // Fold (mul x, 0/undef) -> 0 and (mul x, 1) -> x.
  // We can replace vectors with '0' and '1' factors with a clearing mask.
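  // e.g. (mul x, <4 x i32> <0, 1, 0, 1>) becomes
  //      (and x, <4 x i32> <0, -1, 0, -1>), one all-ones lane per '1' factor.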
  if (VT.isFixedLengthVector()) {
    unsigned NumElts = VT.getVectorNumElements();
    SmallBitVector ClearMask;
    ClearMask.reserve(NumElts);
    auto IsClearMask = [&ClearMask](ConstantSDNode *V) {
      if (!V || V->isZero()) {
        ClearMask.push_back(true);
        return true;
      }
      ClearMask.push_back(false);
      return V->isOne();
    };
    if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) &&
        ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) {
      assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector");
      EVT LegalSVT = N1.getOperand(0).getValueType();
      SDValue Zero = DAG.getConstant(0, DL, LegalSVT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT);
      SmallVector<SDValue, 16> Mask(NumElts, AllOnes);
      for (unsigned I = 0; I != NumElts; ++I)
        if (ClearMask[I])
          Mask[I] = Zero;
      return DAG.getNode(ISD::AND, DL, VT, N0,
                         DAG.getBuildVector(VT, DL, Mask));
    }
  }

  // reassociate mul
  if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
    return RMUL;

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

/// Return true if divmod libcall is available.
static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                     const TargetLowering &TLI) {
  RTLIB::Libcall LC;
  EVT NodeType = Node->getValueType(0);
  if (!NodeType.isSimple())
    return false;
  switch (NodeType.getSimpleVT().SimpleTy) {
  default: return false; // No libcall for vector types.
  case MVT::i8:   LC = isSigned ? RTLIB::SDIVREM_I8   : RTLIB::UDIVREM_I8;   break;
  case MVT::i16:  LC = isSigned ? RTLIB::SDIVREM_I16  : RTLIB::UDIVREM_I16;  break;
  case MVT::i32:  LC = isSigned ? RTLIB::SDIVREM_I32  : RTLIB::UDIVREM_I32;  break;
  case MVT::i64:  LC = isSigned ? RTLIB::SDIVREM_I64  : RTLIB::UDIVREM_I64;  break;
  case MVT::i128: LC = isSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128; break;
  }

  return TLI.getLibcallName(LC) != nullptr;
}

/// Issue divrem if both quotient and remainder are needed.
SDValue DAGCombiner::useDivRem(SDNode *Node) {
  if (Node->use_empty())
    return SDValue(); // This is a dead node, leave it alone.

  unsigned Opcode = Node->getOpcode();
  bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
  unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;

  // DivMod lib calls can still work on non-legal types if using lib-calls.
  EVT VT = Node->getValueType(0);
  if (VT.isVector() || !VT.isInteger())
    return SDValue();

  if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
    return SDValue();

  // If DIVREM is going to get expanded into a libcall,
  // but there is no libcall available, then don't combine.
  if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
      !isDivRemLibcallAvailable(Node, isSigned, TLI))
    return SDValue();

  // If div is legal, it's better to do the normal expansion.
  unsigned OtherOpcode = 0;
  if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
    OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
    if (TLI.isOperationLegalOrCustom(Opcode, VT))
      return SDValue();
  } else {
    OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
    if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
      return SDValue();
  }

  SDValue Op0 = Node->getOperand(0);
  SDValue Op1 = Node->getOperand(1);
  SDValue combined;
  for (SDNode *User : Op0->uses()) {
    if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
        User->use_empty())
      continue;
    // Convert the other matching node(s), too;
    // otherwise, the DIVREM may get target-legalized into something
    // target-specific that we won't be able to recognize.
    unsigned UserOpc = User->getOpcode();
    if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
        User->getOperand(0) == Op0 &&
        User->getOperand(1) == Op1) {
      if (!combined.getNode()) {
        if (UserOpc == OtherOpcode) {
          SDVTList VTs = DAG.getVTList(VT, VT);
          combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
        } else if (UserOpc == DivRemOpc) {
          combined = SDValue(User, 0);
        } else {
          assert(UserOpc == Opcode);
          continue;
        }
      }
      if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
        CombineTo(User, combined);
      else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
        CombineTo(User, combined.getValue(1));
    }
  }
  return combined;
}

static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  unsigned Opc = N->getOpcode();
  bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // X / undef -> undef
  // X % undef -> undef
  // X / 0 -> undef
  // X % 0 -> undef
  // NOTE: This includes vectors where any divisor element is zero/undef.
  if (DAG.isUndef(Opc, {N0, N1}))
    return DAG.getUNDEF(VT);

  // undef / X -> 0
  // undef % X -> 0
  if (N0.isUndef())
    return DAG.getConstant(0, DL, VT);

  // 0 / X -> 0
  // 0 % X -> 0
  ConstantSDNode *N0C = isConstOrConstSplat(N0);
  if (N0C && N0C->isZero())
    return N0;

  // X / X -> 1
  // X % X -> 0
  if (N0 == N1)
    return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);

  // X / 1 -> X
  // X % 1 -> 0
  // If this is a boolean op (single-bit element type), we can't have
  // division-by-zero or remainder-by-zero, so assume the divisor is 1.
  // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
  // it's a 1.
  if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
    return IsDiv ? N0 : DAG.getConstant(0, DL, VT);

  return SDValue();
}

SDValue DAGCombiner::visitSDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);
  SDLoc DL(N);

  // fold (sdiv c1, c2) -> c1/c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  // fold (sdiv X, -1) -> 0-X
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (N1C && N1C->isAllOnes())
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);

  // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
  if (N1C && N1C->getAPIntValue().isMinSignedValue())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // If we know the sign bits of both operands are zero, strength reduce to a
  // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
  if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);

  if (SDValue V = visitSDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor)).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
                                              {N0, N1})) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // sdiv, srem -> sdivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true. Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

static bool isDivisorPowerOfTwo(SDValue Divisor) {
  // Helper for determining whether a value is a power-2 constant scalar or a
  // vector of such elements.
  auto IsPowerOfTwo = [](ConstantSDNode *C) {
    if (C->isZero() || C->isOpaque())
      return false;
    if (C->getAPIntValue().isPowerOf2())
      return true;
    if (C->getAPIntValue().isNegatedPowerOf2())
      return true;
    return false;
  };

  return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo);
}

SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);
  unsigned BitWidth = VT.getScalarSizeInBits();

  // fold (sdiv X, pow2) -> simple ops after legalize
  // FIXME: We check for the exact bit here because the generic lowering gives
  // better results in that case. The target-specific lowering should learn how
  // to handle exact sdivs efficiently.
  if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
    // Target-specific implementation of sdiv x, pow2.
    if (SDValue Res = BuildSDIVPow2(N))
      return Res;

    // Create constants that are functions of the shift amount value.
    EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
    SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
    SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
    C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
    SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
    if (!isConstantOrConstantVector(Inexact))
      return SDValue();

    // Splat the sign bit into the register.
    SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
                               DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
    AddToWorklist(Sign.getNode());

    // Add (N0 < 0) ? abs2 - 1 : 0;
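    // Worked example for i32 and N1 = 4: Inexact = 32 - 2 = 30, so Srl is
    // (X >>s 31) >>u 30, i.e. 0 for non-negative X and 3 (= abs(4) - 1) for
    // negative X; adding it before the final >>s 2 rounds toward zero.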
    SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
    AddToWorklist(Srl.getNode());
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
    AddToWorklist(Add.getNode());
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
    AddToWorklist(Sra.getNode());

    // Special case: (sdiv X, 1) -> X
    // Special case: (sdiv X, -1) -> 0-X
    SDValue One = DAG.getConstant(1, DL, VT);
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
    SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
    SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
    SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
    Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);

    // If dividing by a positive value, we're done. Otherwise, the result must
    // be negated.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);

    // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
    SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
    SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
    return Res;
  }

  // If integer divide is expensive and we satisfy the requirements, emit an
  // alternate sequence. Targets may check function attributes for size/speed
  // trade-offs.
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildSDIV(N))
      return Op;

  return SDValue();
}

SDValue DAGCombiner::visitUDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);
  SDLoc DL(N);

  // fold (udiv c1, c2) -> c1/c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  // fold (udiv X, -1) -> select(X == -1, 1, 0)
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (N1C && N1C->isAllOnes())
    return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
                         DAG.getConstant(1, DL, VT),
                         DAG.getConstant(0, DL, VT));

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (SDValue V = visitUDIVLike(N0, N1, N)) {
    // If the corresponding remainder node exists, update its users with
    // (Dividend - (Quotient * Divisor)).
    if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
                                              {N0, N1})) {
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(Mul.getNode());
      AddToWorklist(Sub.getNode());
      CombineTo(RemNode, Sub);
    }
    return V;
  }

  // udiv, urem -> udivrem
  // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
  // true. Otherwise, we break the simplification logic in visitREM().
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue DivRem = useDivRem(N))
      return DivRem;

  return SDValue();
}

SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);

  // fold (udiv x, (1 << c)) -> x >>u c
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1)) {
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    AddToWorklist(LogBase2.getNode());

    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
    AddToWorklist(Trunc.getNode());
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
  if (N1.getOpcode() == ISD::SHL) {
    SDValue N10 = N1.getOperand(0);
    if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
        DAG.isKnownToBeAPowerOfTwo(N10)) {
      SDValue LogBase2 = BuildLogBase2(N10, DL);
      AddToWorklist(LogBase2.getNode());

      EVT ADDVT = N1.getOperand(1).getValueType();
      SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
      AddToWorklist(Trunc.getNode());
      SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
    }
  }

  // fold (udiv x, c) -> alternate
  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
  if (isConstantOrConstantVector(N1) &&
      !TLI.isIntDivCheap(N->getValueType(0), Attr))
    if (SDValue Op = BuildUDIV(N))
      return Op;

  return SDValue();
}

SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) {
  if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) &&
      !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) {
    // Target-specific implementation of srem x, pow2.
    if (SDValue Res = BuildSREMPow2(N))
      return Res;
  }
  return SDValue();
}

// handles ISD::SREM and ISD::UREM
SDValue DAGCombiner::visitREM(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  EVT CCVT = getSetCCResultType(VT);

  bool isSigned = (Opcode == ISD::SREM);
  SDLoc DL(N);

  // fold (rem c1, c2) -> c1%c2
  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
    return C;

  // fold (urem X, -1) -> select(FX == -1, 0, FX)
  // Freeze the numerator to avoid a miscompile with an undefined value.
  if (!isSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false)) {
    SDValue F0 = DAG.getFreeze(N0);
    SDValue EqualsNeg1 = DAG.getSetCC(DL, CCVT, F0, N1, ISD::SETEQ);
    return DAG.getSelect(DL, VT, EqualsNeg1, DAG.getConstant(0, DL, VT), F0);
  }

  if (SDValue V = simplifyDivRem(N, DAG))
    return V;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (isSigned) {
    // If we know the sign bits of both operands are zero, strength reduce to a
    // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
    if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
  } else {
    if (DAG.isKnownToBeAPowerOfTwo(N1)) {
      // fold (urem x, pow2) -> (and x, pow2-1)
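      // e.g. (urem x, 8) becomes (and x, 7): the low three bits are exactly
      // the remainder modulo 8.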
      SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
    // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
    // TODO: We should sink the following into isKnownToBePowerOfTwo
    // using an OrZero parameter analogous to our handling in ValueTracking.
    if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
        DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
      SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
      AddToWorklist(Add.getNode());
      return DAG.getNode(ISD::AND, DL, VT, N0, Add);
    }
  }

  AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();

  // If X/C can be simplified by the division-by-constant logic, lower
  // X%C to the equivalent of X-X/C*C.
  // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
  // speculative DIV must not cause a DIVREM conversion. We guard against this
  // by skipping the simplification if isIntDivCheap(). When div is not cheap,
  // combine will not return a DIVREM. Regardless, checking cheapness here
  // makes sense since the simplification results in fatter code.
  if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
    if (isSigned) {
      // check if we can build a faster implementation for srem
      if (SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N))
        return OptimizedRem;
    }

    SDValue OptimizedDiv =
        isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
    if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
      // If the equivalent Div node also exists, update its users.
      unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
      if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
                                                {N0, N1}))
        CombineTo(DivNode, OptimizedDiv);
      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
      SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
      AddToWorklist(OptimizedDiv.getNode());
      AddToWorklist(Mul.getNode());
      return Sub;
    }
  }

  // srem -> sdivrem, urem -> udivrem
  if (SDValue DivRem = useDivRem(N))
    return DivRem.getValue(1);

  return SDValue();
}

SDValue DAGCombiner::visitMULHS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // fold (mulhs c1, c2)
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0);

  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    // fold (mulhs x, 0) -> 0
    // do not return N1, because an undef node may exist.
    if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhs x, 0) -> 0
  if (isNullConstant(N1))
    return N1;

  // fold (mulhs x, 1) -> (sra x, size(x)-1)
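  // The high half of sext(x) * 1 is just the sign bits of x, so for i8 this
  // is (sra x, 7): all zeros for non-negative x, all ones for negative x.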
  if (isOneConstant(N1))
    return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
                       DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
                                       getShiftAmountTy(N0.getValueType())));

  // fold (mulhs x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // If the type twice as wide is legal, transform the mulhs to a wider
  // multiply plus a shift.
  if (!TLI.isOperationLegalOrCustom(ISD::MULHS, VT) && VT.isSimple() &&
      !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitMULHU(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // fold (mulhu c1, c2)
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0);

  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    // fold (mulhu x, 0) -> 0
    // do not return N1, because an undef node may exist.
    if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
      return DAG.getConstant(0, DL, VT);
  }

  // fold (mulhu x, 0) -> 0
  if (isNullConstant(N1))
    return N1;

  // fold (mulhu x, 1) -> 0
  if (isOneConstant(N1))
    return DAG.getConstant(0, DL, N0.getValueType());

  // fold (mulhu x, undef) -> 0
  if (N0.isUndef() || N1.isUndef())
    return DAG.getConstant(0, DL, VT);

  // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
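  // e.g. for i8, mulhu x, 16 is the high byte of (zext x) * 16, which equals
  // x >> 4, i.e. x >> (8 - log2(16)).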
  if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
      DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
    unsigned NumEltBits = VT.getScalarSizeInBits();
    SDValue LogBase2 = BuildLogBase2(N1, DL);
    SDValue SRLAmt = DAG.getNode(
        ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
    EVT ShiftVT = getShiftAmountTy(N0.getValueType());
    SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
    return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
  }

  // If the type twice as wide is legal, transform the mulhu to a wider
  // multiply plus a shift.
  if (!TLI.isOperationLegalOrCustom(ISD::MULHU, VT) && VT.isSimple() &&
      !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
      N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
      N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
      N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(N1.getValueType())));
      return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
    }
  }

  // Simplify the operands using demanded-bits information.
  // We don't have demanded-bits support for MULHU, so this just enables
  // constant folding based on known bits.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

SDValue DAGCombiner::visitAVG(SDNode *N) {
  unsigned Opcode = N->getOpcode();
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // fold (avg c1, c2)
  if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);

  if (VT.isVector()) {
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

    // fold (avgfloor x, 0) -> x >> 1
    if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
      if (Opcode == ISD::AVGFLOORS)
        return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
      if (Opcode == ISD::AVGFLOORU)
        return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
    }
  }

  // fold (avg x, undef) -> x
  if (N0.isUndef())
    return N1;
  if (N1.isUndef())
    return N0;

  // TODO: If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1.

  return SDValue();
}

/// Perform optimizations common to nodes that compute two values. LoOp and
/// HiOp give the opcodes for the two computations that are being performed.
/// Return the combined node if a simplification was made.
SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                                unsigned HiOp) {
  // If the high half is not needed, just compute the low half.
  bool HiExists = N->hasAnyUseOfValue(1);
  if (!HiExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
    SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If the low half is not needed, just compute the high half.
  bool LoExists = N->hasAnyUseOfValue(0);
  if (!LoExists && (!LegalOperations ||
                    TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
    SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    return CombineTo(N, Res, Res);
  }

  // If both halves are used, return as it is.
  if (LoExists && HiExists)
    return SDValue();

  // If the two computed results can be simplified separately, separate them.
  if (LoExists) {
    SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
    AddToWorklist(Lo.getNode());
    SDValue LoOpt = combine(Lo.getNode());
    if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
      return CombineTo(N, LoOpt, LoOpt);
  }

  if (HiExists) {
    SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
    AddToWorklist(Hi.getNode());
    SDValue HiOpt = combine(Hi.getNode());
    if (HiOpt.getNode() && HiOpt != Hi &&
        (!LegalOperations ||
         TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
      return CombineTo(N, HiOpt, HiOpt);
  }

  return SDValue();
}

SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
    return Res;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N1, N0);

  // If the type twice as wide is legal, transform this into a wider multiply
  // plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
      SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
      // Compute the high part as N1.
      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(Lo.getValueType())));
      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
      // Compute the low part as N0.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
      return CombineTo(N, Lo, Hi);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
  if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
    return Res;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // canonicalize constant to RHS (vector doesn't have to splat)
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N1, N0);

  // (umul_lohi N0, 0) -> (0, 0)
  if (isNullConstant(N1)) {
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return CombineTo(N, Zero, Zero);
  }

  // (umul_lohi N0, 1) -> (N0, 0)
  if (isOneConstant(N1)) {
    SDValue Zero = DAG.getConstant(0, DL, VT);
    return CombineTo(N, N0, Zero);
  }

  // If the type twice as wide is legal, transform this into a wider multiply
  // plus a shift.
  if (VT.isSimple() && !VT.isVector()) {
    MVT Simple = VT.getSimpleVT();
    unsigned SimpleSize = Simple.getSizeInBits();
    EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize * 2);
    if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
      SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
      SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
      Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
      // Compute the high part as N1.
      Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
                       DAG.getConstant(SimpleSize, DL,
                                       getShiftAmountTy(Lo.getValueType())));
      Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
      // Compute the low part as N0.
      Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
      return CombineTo(N, Lo, Hi);
    }
  }

  return SDValue();
}

SDValue DAGCombiner::visitMULO(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N0.getValueType();
  bool IsSigned = (ISD::SMULO == N->getOpcode());

  EVT CarryVT = N->getValueType(1);
  SDLoc DL(N);

  ConstantSDNode *N0C = isConstOrConstSplat(N0);
  ConstantSDNode *N1C = isConstOrConstSplat(N1);

  // fold operation with constant operands.
  // TODO: Move this to FoldConstantArithmetic when it supports nodes with
  // multiple results.
  if (N0C && N1C) {
    bool Overflow;
    APInt Result =
        IsSigned ? N0C->getAPIntValue().smul_ov(N1C->getAPIntValue(), Overflow)
                 : N0C->getAPIntValue().umul_ov(N1C->getAPIntValue(), Overflow);
    return CombineTo(N, DAG.getConstant(Result, DL, VT),
                     DAG.getBoolConstant(Overflow, DL, CarryVT, CarryVT));
  }

  // canonicalize constant to RHS.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      !DAG.isConstantIntBuildVectorOrConstantInt(N1))
    return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);

  // fold (mulo x, 0) -> 0 + no carry out
  if (isNullOrNullSplat(N1))
    return CombineTo(N, DAG.getConstant(0, DL, VT),
                     DAG.getConstant(0, DL, CarryVT));

  // (mulo x, 2) -> (addo x, x)
  // FIXME: This needs a freeze.
  if (N1C && N1C->getAPIntValue() == 2 &&
      (!IsSigned || VT.getScalarSizeInBits() > 2))
    return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
                       N->getVTList(), N0, N0);

  if (IsSigned) {
    // A 1 bit SMULO overflows if both inputs are 1.
    if (VT.getScalarSizeInBits() == 1) {
      SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1);
      return CombineTo(N, And,
                       DAG.getSetCC(DL, CarryVT, And,
                                    DAG.getConstant(0, DL, VT), ISD::SETNE));
    }

    // Multiplying n * m significant bits yields a result of n + m significant
    // bits. If the total number of significant bits does not exceed the
    // result bit width (minus 1), there is no overflow.
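    // e.g. for i16 operands that are each sign-extended from i8, both have at
    // least 9 sign bits, and 9 + 9 = 18 > 16 + 1, so the product cannot wrap.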
    unsigned SignBits = DAG.ComputeNumSignBits(N0);
    if (SignBits > 1)
      SignBits += DAG.ComputeNumSignBits(N1);
    if (SignBits > VT.getScalarSizeInBits() + 1)
      return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
                       DAG.getConstant(0, DL, CarryVT));
  } else {
    KnownBits N1Known = DAG.computeKnownBits(N1);
    KnownBits N0Known = DAG.computeKnownBits(N0);
    bool Overflow;
    (void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
    if (!Overflow)
      return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
                       DAG.getConstant(0, DL, CarryVT));
  }

  return SDValue();
}

// Function to calculate whether the Min/Max pair of SDNodes (potentially
// swapped around) make a signed saturate pattern, clamping to between a signed
// saturate of -2^(BW-1) and 2^(BW-1)-1, or an unsigned saturate of 0 and 2^BW.
// Returns the node being clamped and the bitwidth of the clamp in BW. Should
// work with both SMIN/SMAX nodes and setcc/select combo. The operands are the
// same as SimplifySelectCC. N0<N1 ? N2 : N3.
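// For instance, smin(smax(x, -128), 127) (or the equivalent setcc/select
// chain) clamps x to the signed i8 range, yielding BW = 8 and Unsigned =
// false, while smax(smin(x, 255), 0) yields BW = 8 with Unsigned = true.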
static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
                                  SDValue N3, ISD::CondCode CC, unsigned &BW,
                                  bool &Unsigned) {
  auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3,
                            ISD::CondCode CC) {
    // The compare and select operand should be the same or the select operands
    // should be truncated versions of the comparison.
    if (N0 != N2 && (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0)))
      return 0;
    // The constants need to be the same or a truncated version of each other.
    ConstantSDNode *N1C = isConstOrConstSplat(N1);
    ConstantSDNode *N3C = isConstOrConstSplat(N3);
    if (!N1C || !N3C)
      return 0;
    const APInt &C1 = N1C->getAPIntValue();
    const APInt &C2 = N3C->getAPIntValue();
    if (C1.getBitWidth() < C2.getBitWidth() || C1 != C2.sext(C1.getBitWidth()))
      return 0;
    return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0);
  };

  // Check the initial value is a SMIN/SMAX equivalent.
  unsigned Opcode0 = isSignedMinMax(N0, N1, N2, N3, CC);
  if (!Opcode0)
    return SDValue();

  SDValue N00, N01, N02, N03;
  ISD::CondCode N0CC;
  switch (N0.getOpcode()) {
  case ISD::SMIN:
  case ISD::SMAX:
    N00 = N02 = N0.getOperand(0);
    N01 = N03 = N0.getOperand(1);
    N0CC = N0.getOpcode() == ISD::SMIN ? ISD::SETLT : ISD::SETGT;
    break;
  case ISD::SELECT_CC:
    N00 = N0.getOperand(0);
    N01 = N0.getOperand(1);
    N02 = N0.getOperand(2);
    N03 = N0.getOperand(3);
    N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
    break;
  case ISD::SELECT:
  case ISD::VSELECT:
    if (N0.getOperand(0).getOpcode() != ISD::SETCC)
      return SDValue();
    N00 = N0.getOperand(0).getOperand(0);
    N01 = N0.getOperand(0).getOperand(1);
    N02 = N0.getOperand(1);
    N03 = N0.getOperand(2);
    N0CC = cast<CondCodeSDNode>(N0.getOperand(0).getOperand(2))->get();
    break;
  default:
    return SDValue();
  }

  unsigned Opcode1 = isSignedMinMax(N00, N01, N02, N03, N0CC);
  if (!Opcode1 || Opcode0 == Opcode1)
    return SDValue();

  ConstantSDNode *MinCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N1 : N01);
  ConstantSDNode *MaxCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N01 : N1);
  if (!MinCOp || !MaxCOp || MinCOp->getValueType(0) != MaxCOp->getValueType(0))
    return SDValue();

  const APInt &MinC = MinCOp->getAPIntValue();
  const APInt &MaxC = MaxCOp->getAPIntValue();
  APInt MinCPlus1 = MinC + 1;
  if (-MaxC == MinCPlus1 && MinCPlus1.isPowerOf2()) {
    BW = MinCPlus1.exactLogBase2() + 1;
    Unsigned = false;
    return N02;
  }

  if (MaxC == 0 && MinCPlus1.isPowerOf2()) {
    BW = MinCPlus1.exactLogBase2();
    Unsigned = true;
    return N02;
  }

  return SDValue();
}
5123 static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
5124 SDValue N3, ISD::CondCode CC,
5125 SelectionDAG &DAG) {
5128 SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned);
5129 if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT)
5131 EVT FPVT = Fp.getOperand(0).getValueType();
5132 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
5133 if (FPVT.isVector())
5134 NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
5135 FPVT.getVectorElementCount());
5136 unsigned NewOpc = Unsigned ? ISD::FP_TO_UINT_SAT : ISD::FP_TO_SINT_SAT;
5137 if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(NewOpc, FPVT, NewVT))
5140 SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0),
5141 DAG.getValueType(NewVT.getScalarType()));
5142 return Unsigned ? DAG.getZExtOrTrunc(Sat, DL, N2->getValueType(0))
5143 : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0));
5146 static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
5147 SDValue N3, ISD::CondCode CC,
5148 SelectionDAG &DAG) {
5149 // We are looking for UMIN(FPTOUI(X), (2^n)-1), which may have come via a
5150 // select/vselect/select_cc. The two operand pairs for the select (N2/N3) may
5151 // be truncated versions of the setcc (N0/N1).
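// Illustrative example: umin (fp_to_uint X), 255 becomes an 8-bit
// fp_to_uint_sat (255 + 1 = 2^8, so BW = 8), zero-extended back to the
// original result type.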
5153 (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) ||
5154 N0.getOpcode() != ISD::FP_TO_UINT || CC != ISD::SETULT)
5156 ConstantSDNode *N1C = isConstOrConstSplat(N1);
5157 ConstantSDNode *N3C = isConstOrConstSplat(N3);
5160 const APInt &C1 = N1C->getAPIntValue();
5161 const APInt &C3 = N3C->getAPIntValue();
5162 if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() ||
5163 C1 != C3.zext(C1.getBitWidth()))
5166 unsigned BW = (C1 + 1).exactLogBase2();
5167 EVT FPVT = N0.getOperand(0).getValueType();
5168 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
5169 if (FPVT.isVector())
5170 NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
5171 FPVT.getVectorElementCount());
5172 if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT,
5177 DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), NewVT, N0.getOperand(0),
5178 DAG.getValueType(NewVT.getScalarType()));
5179 return DAG.getZExtOrTrunc(Sat, SDLoc(N0), N3.getValueType());
5182 SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
5183 SDValue N0 = N->getOperand(0);
5184 SDValue N1 = N->getOperand(1);
5185 EVT VT = N0.getValueType();
5186 unsigned Opcode = N->getOpcode();
5189 // fold operation with constant operands.
5190 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
5193 // If the operands are the same, this is a no-op.
5197 // canonicalize constant to RHS
5198 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
5199 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
5200 return DAG.getNode(Opcode, DL, VT, N1, N0);
5204 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
5207 // If the sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
5208 // Only do this if the current op isn't legal and the flipped is.
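// Illustrative example: if both operands are known non-negative, smin(x, y)
// and umin(x, y) produce the same result, so the legal flavor can stand in
// for the illegal one.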
5209 if (!TLI.isOperationLegal(Opcode, VT) &&
5210 (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
5211 (N1.isUndef() || DAG.SignBitIsZero(N1))) {
5214 case ISD::SMIN: AltOpcode = ISD::UMIN; break;
5215 case ISD::SMAX: AltOpcode = ISD::UMAX; break;
5216 case ISD::UMIN: AltOpcode = ISD::SMIN; break;
5217 case ISD::UMAX: AltOpcode = ISD::SMAX; break;
5218 default: llvm_unreachable("Unknown MINMAX opcode");
5220 if (TLI.isOperationLegal(AltOpcode, VT))
5221 return DAG.getNode(AltOpcode, DL, VT, N0, N1);
5224 if (Opcode == ISD::SMIN || Opcode == ISD::SMAX)
5225 if (SDValue S = PerformMinMaxFpToSatCombine(
5226 N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG))
5228 if (Opcode == ISD::UMIN)
5229 if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
5232 // Simplify the operands using demanded-bits information.
5233 if (SimplifyDemandedBits(SDValue(N, 0)))
5234 return SDValue(N, 0);
5239 /// If this is a bitwise logic instruction and both operands have the same
5240 /// opcode, try to sink the other opcode after the logic instruction.
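/// Illustrative example: and (zext i8 X to i32), (zext i8 Y to i32)
/// --> zext (and i8 X, Y) to i32, so the logic happens in the narrow type
/// and one extension disappears.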
5241 SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
5242 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
5243 EVT VT = N0.getValueType();
5244 unsigned LogicOpcode = N->getOpcode();
5245 unsigned HandOpcode = N0.getOpcode();
5246 assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
5247 LogicOpcode == ISD::XOR) && "Expected logic opcode");
5248 assert(HandOpcode == N1.getOpcode() && "Bad input!");
5250 // Bail early if none of these transforms apply.
5251 if (N0.getNumOperands() == 0)
5254 // FIXME: We should check number of uses of the operands to not increase
5255 // the instruction count for all transforms.
5257 // Handle size-changing casts.
5258 SDValue X = N0.getOperand(0);
5259 SDValue Y = N1.getOperand(0);
5260 EVT XVT = X.getValueType();
5262 if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
5263 HandOpcode == ISD::SIGN_EXTEND) {
5264 // If both operands have other uses, this transform would create extra
5265 // instructions without eliminating anything.
5266 if (!N0.hasOneUse() && !N1.hasOneUse())
5268 // We need matching integer source types.
5269 if (XVT != Y.getValueType())
5271 // Don't create an illegal op during or after legalization. Don't ever
5272 // create an unsupported vector op.
5273 if ((VT.isVector() || LegalOperations) &&
5274 !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
5276 // Avoid infinite looping with PromoteIntBinOp.
5277 // TODO: Should we apply desirable/legal constraints to all opcodes?
5278 if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
5279 !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
5281 // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
5282 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5283 return DAG.getNode(HandOpcode, DL, VT, Logic);
5286 // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
5287 if (HandOpcode == ISD::TRUNCATE) {
5288 // If both operands have other uses, this transform would create extra
5289 // instructions without eliminating anything.
5290 if (!N0.hasOneUse() && !N1.hasOneUse())
5292 // We need matching source types.
5293 if (XVT != Y.getValueType())
5295 // Don't create an illegal op during or after legalization.
5296 if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
5298 // Be extra careful sinking truncate. If it's free, there's no benefit in
5299 // widening a binop. Also, don't create a logic op on an illegal type.
5300 if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
5302 if (!TLI.isTypeLegal(XVT))
5304 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5305 return DAG.getNode(HandOpcode, DL, VT, Logic);
5308 // For binops SHL/SRL/SRA/AND:
5309 // logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
5310 if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
5311 HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
5312 N0.getOperand(1) == N1.getOperand(1)) {
5313 // If either operand has other uses, this transform is not an improvement.
5314 if (!N0.hasOneUse() || !N1.hasOneUse())
5316 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5317 return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
5320 // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
5321 if (HandOpcode == ISD::BSWAP) {
5322 // If either operand has other uses, this transform is not an improvement.
5323 if (!N0.hasOneUse() || !N1.hasOneUse())
5325 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5326 return DAG.getNode(HandOpcode, DL, VT, Logic);
5329 // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
5330 // Only perform this optimization up until type legalization, before
5331 // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
5332 // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
5333 // we don't want to undo this promotion.
5334 // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
5336 if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
5337 Level <= AfterLegalizeTypes) {
5338 // Input types must be integer and the same.
5339 if (XVT.isInteger() && XVT == Y.getValueType() &&
5340 !(VT.isVector() && TLI.isTypeLegal(VT) &&
5341 !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
5342 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5343 return DAG.getNode(HandOpcode, DL, VT, Logic);
5347 // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
5348 // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
5349 // If both shuffles use the same mask, and both shuffle within a single
5350 // vector, then it is worthwhile to move the swizzle after the operation.
5351 // The type-legalizer generates this pattern when loading illegal
5352 // vector types from memory. In many cases this allows additional shuffle
5354 // There are other cases where moving the shuffle after the xor/and/or
5355 // is profitable even if shuffles don't perform a swizzle.
5356 // If both shuffles use the same mask, and both shuffles have the same first
5357 // or second operand, then it might still be profitable to move the shuffle
5358 // after the xor/and/or operation.
5359 if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
5360 auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
5361 auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
5362 assert(X.getValueType() == Y.getValueType() &&
5363 "Inputs to shuffles are not the same type");
5365 // Check that both shuffles use the same mask. The masks are known to be of
5366 // the same length because the result vector type is the same.
5367 // Check also that shuffles have only one use to avoid introducing extra
5369 if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
5370 !SVN0->getMask().equals(SVN1->getMask()))
5373 // Don't try to fold this node if it requires introducing a
5374 // build vector of all zeros that might be illegal at this stage.
5375 SDValue ShOp = N0.getOperand(1);
5376 if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
5377 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
5379 // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
5380 if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
5381 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
5382 N0.getOperand(0), N1.getOperand(0));
5383 return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
5386 // Don't try to fold this node if it requires introducing a
5387 // build vector of all zeros that might be illegal at this stage.
5388 ShOp = N0.getOperand(0);
5389 if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
5390 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
5392 // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
5393 if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
5394 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
5396 return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
5403 /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
5404 SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
5406 SDValue LL, LR, RL, RR, N0CC, N1CC;
5407 if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
5408 !isSetCCEquivalent(N1, RL, RR, N1CC))
5411 assert(N0.getValueType() == N1.getValueType() &&
5412 "Unexpected operand types for bitwise logic op");
5413 assert(LL.getValueType() == LR.getValueType() &&
5414 RL.getValueType() == RR.getValueType() &&
5415 "Unexpected operand types for setcc");
5417 // If we're here post-legalization or the logic op type is not i1, the logic
5418 // op type must match a setcc result type. Also, all folds require new
5419 // operations on the left and right operands, so those types must match.
5420 EVT VT = N0.getValueType();
5421 EVT OpVT = LL.getValueType();
5422 if (LegalOperations || VT.getScalarType() != MVT::i1)
5423 if (VT != getSetCCResultType(OpVT))
5425 if (OpVT != RL.getValueType())
5428 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
5429 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
5430 bool IsInteger = OpVT.isInteger();
5431 if (LR == RR && CC0 == CC1 && IsInteger) {
5432 bool IsZero = isNullOrNullSplat(LR);
5433 bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
5436 bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
5437 // All sign bits clear?
5438 bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
5440 bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
5441 // Any sign bits set?
5442 bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
5444 // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
5445 // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
5446 // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
5447 // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
5448 if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
5449 SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
5450 AddToWorklist(Or.getNode());
5451 return DAG.getSetCC(DL, VT, Or, LR, CC1);
5455 bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
5456 // All sign bits set?
5457 bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
5459 bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
5460 // Any sign bits clear?
5461 bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
5463 // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
5464 // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
5465 // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
5466 // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
5467 if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
5468 SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
5469 AddToWorklist(And.getNode());
5470 return DAG.getSetCC(DL, VT, And, LR, CC1);
5474 // TODO: What is the 'or' equivalent of this fold?
5475 // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
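// Illustrative check: X == 0 gives X + 1 == 1 and X == -1 wraps to 0, both
// unsigned-less-than 2; any other X gives X + 1 >=u 2, matching the
// conjunction of the two setne conditions.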
5476 if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
5477 IsInteger && CC0 == ISD::SETNE &&
5478 ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
5479 (isAllOnesConstant(LR) && isNullConstant(RR)))) {
5480 SDValue One = DAG.getConstant(1, DL, OpVT);
5481 SDValue Two = DAG.getConstant(2, DL, OpVT);
5482 SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
5483 AddToWorklist(Add.getNode());
5484 return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
5487 // Try more general transforms if the predicates match and the only user of
5488 // the compares is the 'and' or 'or'.
5489 if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
5490 N0.hasOneUse() && N1.hasOneUse()) {
5491 // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
5492 // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
5493 if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
5494 SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
5495 SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
5496 SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
5497 SDValue Zero = DAG.getConstant(0, DL, OpVT);
5498 return DAG.getSetCC(DL, VT, Or, Zero, CC1);
5501 // Turn compare of constants whose difference is 1 bit into add+and+setcc.
5502 if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
5503 // Match a shared variable operand and 2 non-opaque constant operands.
5504 auto MatchDiffPow2 = [&](ConstantSDNode *C0, ConstantSDNode *C1) {
5505 // The difference of the constants must be a single bit.
5507 APIntOps::umax(C0->getAPIntValue(), C1->getAPIntValue());
5509 APIntOps::umin(C0->getAPIntValue(), C1->getAPIntValue());
5510 return !C0->isOpaque() && !C1->isOpaque() && (CMax - CMin).isPowerOf2();
5512 if (LL == RL && ISD::matchBinaryPredicate(LR, RR, MatchDiffPow2)) {
5513 // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) -->
5514 // setcc (and (sub X, CMin), ~(CMax - CMin)), 0, ne/eq
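// Illustrative example: X != 12 && X != 8 (CMax = 12, CMin = 8) becomes
// setne ((X - 8) & ~4), 0, since (X - 8) & ~4 is zero exactly for
// X == 8 and X == 12.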
5515 SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR);
5516 SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR);
5517 SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min);
5518 SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min);
5519 SDValue Mask = DAG.getNOT(DL, Diff, OpVT);
5520 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask);
5521 SDValue Zero = DAG.getConstant(0, DL, OpVT);
5522 return DAG.getSetCC(DL, VT, And, Zero, CC0);
5527 // Canonicalize equivalent operands to LL == RL.
5528 if (LL == RR && LR == RL) {
5529 CC1 = ISD::getSetCCSwappedOperands(CC1);
5533 // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
5534 // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
5535 if (LL == RL && LR == RR) {
5536 ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
5537 : ISD::getSetCCOrOperation(CC0, CC1, OpVT);
5538 if (NewCC != ISD::SETCC_INVALID &&
5539 (!LegalOperations ||
5540 (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
5541 TLI.isOperationLegal(ISD::SETCC, OpVT))))
5542 return DAG.getSetCC(DL, VT, LL, LR, NewCC);
5548 /// This contains all DAGCombine rules which reduce two values combined by
5549 /// an And operation to a single value. This makes them reusable in the context
5550 /// of visitSELECT(). Rules involving constants are not included as
5551 /// visitSELECT() already handles those cases.
5552 SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
5553 EVT VT = N1.getValueType();
5556 // fold (and x, undef) -> 0
5557 if (N0.isUndef() || N1.isUndef())
5558 return DAG.getConstant(0, DL, VT);
5560 if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
5563 // TODO: Rewrite this to return a new 'AND' instead of using CombineTo.
5564 if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
5565 VT.getSizeInBits() <= 64 && N0->hasOneUse()) {
5566 if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
5567 if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
5568 // Look for (and (add x, c1), (lshr y, c2)). If C1 isn't a legal
5569 // immediate for an add, but it is legal if its top c2 bits are set,
5570 // transform the ADD so the immediate doesn't need to be materialized
5572 APInt ADDC = ADDI->getAPIntValue();
5573 APInt SRLC = SRLI->getAPIntValue();
5574 if (ADDC.getMinSignedBits() <= 64 &&
5575 SRLC.ult(VT.getSizeInBits()) &&
5576 !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
5577 APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
5578 SRLC.getZExtValue());
5579 if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
5581 if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
5584 DAG.getNode(ISD::ADD, DL0, VT,
5585 N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
5586 CombineTo(N0.getNode(), NewAdd);
5587 // Return N so it doesn't get rechecked!
5588 return SDValue(N, 0);
5596 // Reduce bit extract of low half of an integer to the narrower type.
5597 // (and (srl i64:x, K), KMask) ->
5598 // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K), KMask))
5599 if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
5600 if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
5601 if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
5602 unsigned Size = VT.getSizeInBits();
5603 const APInt &AndMask = CAnd->getAPIntValue();
5604 unsigned ShiftBits = CShift->getZExtValue();
5606 // Bail out, this node will probably disappear anyway.
5610 unsigned MaskBits = AndMask.countTrailingOnes();
5611 EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
5613 if (AndMask.isMask() &&
5614 // Required bits must not span the two halves of the integer and
5615 // must fit in the half size type.
5616 (ShiftBits + MaskBits <= Size / 2) &&
5617 TLI.isNarrowingProfitable(VT, HalfVT) &&
5618 TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
5619 TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
5620 TLI.isTruncateFree(VT, HalfVT) &&
5621 TLI.isZExtFree(HalfVT, VT)) {
5622 // The isNarrowingProfitable check is to avoid regressions on PPC and
5623 // AArch64, which match a few 64-bit bit insert / bit extract patterns
5624 // on downstream users of this. Those patterns could probably be
5625 // extended to handle extensions mixed in.
5628 assert(MaskBits <= Size);
5630 // Extracting the highest bit of the low half.
5631 EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
5632 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
5635 SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
5636 SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
5637 SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
5638 SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
5639 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
5648 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
5649 EVT LoadResultTy, EVT &ExtVT) {
5650 if (!AndC->getAPIntValue().isMask())
5653 unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
5655 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
5656 EVT LoadedVT = LoadN->getMemoryVT();
5658 if (ExtVT == LoadedVT &&
5659 (!LegalOperations ||
5660 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
5661 // ZEXTLOAD will match without needing to change the size of the value being
5666 // Do not change the width of volatile or atomic loads.
5667 if (!LoadN->isSimple())
5670 // Do not generate loads of non-round integer types since these can
5671 // be expensive (and would be wrong if the type is not byte sized).
5672 if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
5675 if (LegalOperations &&
5676 !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
5679 if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
5685 bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
5686 ISD::LoadExtType ExtType, EVT &MemVT,
5690 // Only allow byte offsets.
5694 // Do not generate loads of non-round integer types since these can
5695 // be expensive (and would be wrong if the type is not byte sized).
5696 if (!MemVT.isRound())
5699 // Don't change the width of volatile or atomic loads.
5700 if (!LDST->isSimple())
5703 EVT LdStMemVT = LDST->getMemoryVT();
5705 // Bail out when changing the scalable property, since we can't be sure that
5706 // we're actually narrowing here.
5707 if (LdStMemVT.isScalableVector() != MemVT.isScalableVector())
5710 // Verify that we are actually reducing a load width here.
5711 if (LdStMemVT.bitsLT(MemVT))
5714 // Ensure that this isn't going to produce an unsupported memory access.
5716 assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
5717 const unsigned ByteShAmt = ShAmt / 8;
5718 const Align LDSTAlign = LDST->getAlign();
5719 const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
5720 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
5721 LDST->getAddressSpace(), NarrowAlign,
5722 LDST->getMemOperand()->getFlags()))
5726 // It's not possible to generate a constant of extended or untyped type.
5727 EVT PtrType = LDST->getBasePtr().getValueType();
5728 if (PtrType == MVT::Untyped || PtrType.isExtended())
5731 if (isa<LoadSDNode>(LDST)) {
5732 LoadSDNode *Load = cast<LoadSDNode>(LDST);
5733 // Don't transform one with multiple uses; this would require adding a new
5735 if (!SDValue(Load, 0).hasOneUse())
5738 if (LegalOperations &&
5739 !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
5742 // For the transform to be legal, the load must produce only two values
5743 // (the value loaded and the chain). Don't transform a pre-increment
5744 // load, for example, which produces an extra value. Otherwise the
5745 // transformation is not equivalent, and the downstream logic to replace
5746 // uses gets things wrong.
5747 if (Load->getNumValues() > 2)
5750 // If the load that we're shrinking is an extload and we're not just
5751 // discarding the extension, we can't simply shrink the load. Bail.
5752 // TODO: It would be possible to merge the extensions in some cases.
5753 if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
5754 Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
5757 if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
5760 assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
5761 StoreSDNode *Store = cast<StoreSDNode>(LDST);
5762 // Can't write outside the original store
5763 if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
5766 if (LegalOperations &&
5767 !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
5773 bool DAGCombiner::SearchForAndLoads(SDNode *N,
5774 SmallVectorImpl<LoadSDNode*> &Loads,
5775 SmallPtrSetImpl<SDNode*> &NodesWithConsts,
5776 ConstantSDNode *Mask,
5777 SDNode *&NodeToMask) {
5778 // Recursively search for the operands, looking for loads which can be
5780 for (SDValue Op : N->op_values()) {
5781 if (Op.getValueType().isVector())
5784 // Some constants may need fixing up later if they are too large.
5785 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
5786 if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
5787 (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
5788 NodesWithConsts.insert(N);
5792 if (!Op.hasOneUse())
5795 switch (Op.getOpcode()) {
5797 auto *Load = cast<LoadSDNode>(Op);
5799 if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
5800 isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
5802 // ZEXTLOAD is already small enough.
5803 if (Load->getExtensionType() == ISD::ZEXTLOAD &&
5804 ExtVT.bitsGE(Load->getMemoryVT()))
5807 // Use LE to convert equal sized loads to zext.
5808 if (ExtVT.bitsLE(Load->getMemoryVT()))
5809 Loads.push_back(Load);
5815 case ISD::ZERO_EXTEND:
5816 case ISD::AssertZext: {
5817 unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
5818 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
5819 EVT VT = Op.getOpcode() == ISD::AssertZext ?
5820 cast<VTSDNode>(Op.getOperand(1))->getVT() :
5821 Op.getOperand(0).getValueType();
5823 // We can accept extending nodes if the mask is wider than or equal in
5824 // width to the original type.
5825 if (ExtVT.bitsGE(VT))
5832 if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
5838 // Allow one node which will be masked along with any loads found.
5842 // Also ensure that the node to be masked only produces one data result.
5843 NodeToMask = Op.getNode();
5844 if (NodeToMask->getNumValues() > 1) {
5845 bool HasValue = false;
5846 for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
5847 MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
5848 if (VT != MVT::Glue && VT != MVT::Other) {
5850 NodeToMask = nullptr;
5856 assert(HasValue && "Node to be masked has no data result?");
5862 bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
5863 auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
5867 if (!Mask->getAPIntValue().isMask())
5870 // No need to do anything if the and directly uses a load.
5871 if (isa<LoadSDNode>(N->getOperand(0)))
5874 SmallVector<LoadSDNode*, 8> Loads;
5875 SmallPtrSet<SDNode*, 2> NodesWithConsts;
5876 SDNode *FixupNode = nullptr;
5877 if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
5878 if (Loads.size() == 0)
5881 LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
5882 SDValue MaskOp = N->getOperand(1);
5884 // If it exists, fixup the single node we allow in the tree that needs
5887 LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
5888 SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
5889 FixupNode->getValueType(0),
5890 SDValue(FixupNode, 0), MaskOp);
5891 DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
5892 if (And.getOpcode() == ISD::AND)
5893 DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
5896 // Narrow any constants that need it.
5897 for (auto *LogicN : NodesWithConsts) {
5898 SDValue Op0 = LogicN->getOperand(0);
5899 SDValue Op1 = LogicN->getOperand(1);
5901 if (isa<ConstantSDNode>(Op0))
5902 std::swap(Op0, Op1);
5904 SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
5907 DAG.UpdateNodeOperands(LogicN, Op0, And);
5910 // Create narrow loads.
5911 for (auto *Load : Loads) {
5912 LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
5913 SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
5914 SDValue(Load, 0), MaskOp);
5915 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
5916 if (And.getOpcode() == ISD::AND)
5918 DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
5919 SDValue NewLoad = reduceLoadWidth(And.getNode());
5921 "Shouldn't be masking the load if it can't be narrowed");
5922 CombineTo(Load, NewLoad, NewLoad.getValue(1));
5924 DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
5931 // x & (-1 'logical shift' y)
5933 // (x 'opposite logical shift' y) 'logical shift' y
5934 // if it is better for performance.
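// Illustrative example: x & (-1 << y) clears the low y bits and can be
// rewritten as (x >> y) << y, trading the mask materialization for a pair
// of variable shifts when the target prefers that.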
5935 SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
5936 assert(N->getOpcode() == ISD::AND);
5938 SDValue N0 = N->getOperand(0);
5939 SDValue N1 = N->getOperand(1);
5941 // Do we actually prefer shifts over mask?
5942 if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
5945 // Try to match (-1 '[outer] logical shift' y)
5946 unsigned OuterShift;
5947 unsigned InnerShift; // The opposite direction to the OuterShift.
5948 SDValue Y; // Shift amount.
5949 auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
5952 OuterShift = M->getOpcode();
5953 if (OuterShift == ISD::SHL)
5954 InnerShift = ISD::SRL;
5955 else if (OuterShift == ISD::SRL)
5956 InnerShift = ISD::SHL;
5959 if (!isAllOnesConstant(M->getOperand(0)))
5961 Y = M->getOperand(1);
5968 else if (matchMask(N0))
5974 EVT VT = N->getValueType(0);
5976 // tmp = x 'opposite logical shift' y
5977 SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
5978 // ret = tmp 'logical shift' y
5979 SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
5984 /// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
5985 /// For a target with a bit test, this is expected to become test + set and save
5986 /// at least 1 instruction.
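/// Illustrative example: and (not (srl X, 3)), 1 tests whether bit 3 of X is
/// clear and becomes setcc (and X, 8), 0, eq -- a candidate for a bit-test
/// instruction.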
5987 static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
5988 assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
5990 // This is probably not worthwhile without a supported type.
5991 EVT VT = And->getValueType(0);
5992 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5993 if (!TLI.isTypeLegal(VT))
5996 // Look through an optional extension.
5997 SDValue And0 = And->getOperand(0), And1 = And->getOperand(1);
5998 if (And0.getOpcode() == ISD::ANY_EXTEND && And0.hasOneUse())
5999 And0 = And0.getOperand(0);
6000 if (!isOneConstant(And1) || !And0.hasOneUse())
6005 // Attempt to find a 'not' op.
6006 // TODO: Should we favor test+set even without the 'not' op?
6007 bool FoundNot = false;
6008 if (isBitwiseNot(Src)) {
6010 Src = Src.getOperand(0);
6012 // Look through an optional truncation. The source operand may not be the
6013 // same type as the original 'and', but that is ok because we are masking
6014 // off everything but the low bit.
6015 if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse())
6016 Src = Src.getOperand(0);
6019 // Match a shift-right by constant.
6020 if (Src.getOpcode() != ISD::SRL || !Src.hasOneUse())
6023 // We might have looked through casts that make this transform invalid.
6024 // TODO: If the source type is wider than the result type, do the mask and
6025 // compare in the source type.
6026 unsigned VTBitWidth = VT.getScalarSizeInBits();
6027 SDValue ShiftAmt = Src.getOperand(1);
6028 auto *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
6029 if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(VTBitWidth))
6032 // Set source to shift source.
6033 Src = Src.getOperand(0);
6035 // Try again to find a 'not' op.
6036 // TODO: Should we favor test+set even with two 'not' ops?
6038 if (!isBitwiseNot(Src))
6040 Src = Src.getOperand(0);
6043 if (!TLI.hasBitTest(Src, ShiftAmt))
6046 // Turn this into a bit-test pattern using mask op + setcc:
6047 // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
6048 // and (srl (not X), C)), 1 --> (and X, 1<<C) == 0
6050 SDValue X = DAG.getZExtOrTrunc(Src, DL, VT);
6051 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
6052 SDValue Mask = DAG.getConstant(
6053 APInt::getOneBitSet(VTBitWidth, ShiftAmtC->getZExtValue()), DL, VT);
6054 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
6055 SDValue Zero = DAG.getConstant(0, DL, VT);
6056 SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
6057 return DAG.getZExtOrTrunc(Setcc, DL, VT);
6060 /// For targets that support usubsat, match a bit-hack form of that operation
6061 /// that ends in 'and' and convert it.
6062 static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
6063 SDValue N0 = N->getOperand(0);
6064 SDValue N1 = N->getOperand(1);
6065 EVT VT = N1.getValueType();
6067 // Canonicalize SRA as operand 1.
6068 if (N0.getOpcode() == ISD::SRA)
6071 // xor/add with SMIN (signmask) are logically equivalent.
6072 if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
6075 if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
6076 N0.getOperand(0) != N1.getOperand(0))
6079 unsigned BitWidth = VT.getScalarSizeInBits();
6080 ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
6081 ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
6082 if (!XorC || !XorC->getAPIntValue().isSignMask() ||
6083 !SraC || SraC->getAPIntValue() != BitWidth - 1)
6086 // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
6087 // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
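// Illustrative check with i8: for X >=u 128, the sra gives all-ones and
// X ^ 128 == X - 128, so the and yields X - 128; for X <u 128 the sra gives
// 0 and the and yields 0. Both cases equal usubsat X, 128.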
6089 SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
6090 return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
6093 /// Given a bitwise logic operation N with a matching bitwise logic operand,
6094 /// fold a pattern where 2 of the source operands are identically shifted
6095 /// values. For example:
6096 /// ((X0 << Y) | Z) | (X1 << Y) --> ((X0 | X1) << Y) | Z
6097 static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp,
6098 SelectionDAG &DAG) {
6099 unsigned LogicOpcode = N->getOpcode();
6100 assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
6101 LogicOpcode == ISD::XOR)
6102 && "Expected bitwise logic operation");
6104 if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse())
6107 // Match another bitwise logic op and a shift.
6108 unsigned ShiftOpcode = ShiftOp.getOpcode();
6109 if (LogicOp.getOpcode() != LogicOpcode ||
6110 !(ShiftOpcode == ISD::SHL || ShiftOpcode == ISD::SRL ||
6111 ShiftOpcode == ISD::SRA))
6114 // Match another shift op inside the first logic operand. Handle both commuted
6116 // LOGIC (LOGIC (SH X0, Y), Z), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
6117 // LOGIC (LOGIC Z, (SH X0, Y)), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
6118 SDValue X1 = ShiftOp.getOperand(0);
6119 SDValue Y = ShiftOp.getOperand(1);
6121 if (LogicOp.getOperand(0).getOpcode() == ShiftOpcode &&
6122 LogicOp.getOperand(0).getOperand(1) == Y) {
6123 X0 = LogicOp.getOperand(0).getOperand(0);
6124 Z = LogicOp.getOperand(1);
6125 } else if (LogicOp.getOperand(1).getOpcode() == ShiftOpcode &&
6126 LogicOp.getOperand(1).getOperand(1) == Y) {
6127 X0 = LogicOp.getOperand(1).getOperand(0);
6128 Z = LogicOp.getOperand(0);
6133 EVT VT = N->getValueType(0);
6135 SDValue LogicX = DAG.getNode(LogicOpcode, DL, VT, X0, X1);
6136 SDValue NewShift = DAG.getNode(ShiftOpcode, DL, VT, LogicX, Y);
6137 return DAG.getNode(LogicOpcode, DL, VT, NewShift, Z);
6140 SDValue DAGCombiner::visitAND(SDNode *N) {
6141 SDValue N0 = N->getOperand(0);
6142 SDValue N1 = N->getOperand(1);
6143 EVT VT = N1.getValueType();
6149 // fold (and c1, c2) -> c1&c2
6150 if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
6153 // canonicalize constant to RHS
6154 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
6155 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
6156 return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
6159 if (VT.isVector()) {
6160 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
6163 // fold (and x, 0) -> 0, vector edition
6164 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
6165 // do not return N1, because an undef node may exist in N1
6166 return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
6167 SDLoc(N), N1.getValueType());
6169 // fold (and x, -1) -> x, vector edition
6170 if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
6173 // fold (and (masked_load) (splat_vec (x, ...))) to zext_masked_load
6174 auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0);
6175 ConstantSDNode *Splat = isConstOrConstSplat(N1, true, true);
6176 if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && N0.hasOneUse() &&
6177 Splat && N1.hasOneUse()) {
6178 EVT LoadVT = MLoad->getMemoryVT();
6180 if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) {
6181 // For this AND to be a zero extension of the masked load the elements
6182 // of the BuildVec must mask the bottom bits of the extended element
6184 uint64_t ElementSize =
6185 LoadVT.getVectorElementType().getScalarSizeInBits();
6186 if (Splat->getAPIntValue().isMask(ElementSize)) {
6187 return DAG.getMaskedLoad(
6188 ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
6189 MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
6190 LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
6191 ISD::ZEXTLOAD, MLoad->isExpandingLoad());
6197 // fold (and x, -1) -> x
6198 if (isAllOnesConstant(N1))
6201 // if (and x, c) is known to be zero, return 0
6202 unsigned BitWidth = VT.getScalarSizeInBits();
6203 ConstantSDNode *N1C = isConstOrConstSplat(N1);
6204 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
6205 return DAG.getConstant(0, SDLoc(N), VT);
6207 if (SDValue NewSel = foldBinOpIntoSelect(N))
6211 if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
6214 // Try to convert a constant mask AND into a shuffle clear mask.
6216 if (SDValue Shuffle = XformToShuffleWithZero(N))
6219 if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
6222 // fold (and (or x, C), D) -> D if (C & D) == D
6223 auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
6224 return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
6226 if (N0.getOpcode() == ISD::OR &&
6227 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
6229 // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
6230 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
6231 SDValue N0Op0 = N0.getOperand(0);
6232 APInt Mask = ~N1C->getAPIntValue();
6233 Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
6234 if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
6235 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
6236 N0.getValueType(), N0Op0);
6238 // Replace uses of the AND with uses of the Zero extend node.
6241 // We actually want to replace all uses of the any_extend with the
6242 // zero_extend, to avoid duplicating things. This will later cause this
6243 // AND to be folded.
6244 CombineTo(N0.getNode(), Zext);
6245 return SDValue(N, 0); // Return N so it doesn't get rechecked!
6249 // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
6250 // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
6251 // already be zero by virtue of the width of the base type of the load.
6253 // the 'X' node here can either be nothing or an extract_vector_elt to catch
6255 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6256 N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
6257 N0.getOperand(0).getOpcode() == ISD::LOAD &&
6258 N0.getOperand(0).getResNo() == 0) ||
6259 (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
6260 LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
6261 N0 : N0.getOperand(0) );
6263 // Get the constant (if applicable) the zero'th operand is being ANDed with.
6264 // This can be a pure constant or a vector splat, in which case we treat the
6265 // vector as a scalar and use the splat value.
6266 APInt Constant = APInt::getZero(1);
6267 if (const ConstantSDNode *C = isConstOrConstSplat(
6268 N1, /*AllowUndef=*/false, /*AllowTruncation=*/true)) {
6269 Constant = C->getAPIntValue();
6270 } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
6271 APInt SplatValue, SplatUndef;
6272 unsigned SplatBitSize;
6274 bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
6275 SplatBitSize, HasAnyUndefs);
6277 // Undef bits can contribute to a possible optimisation if set, so
6279 SplatValue |= SplatUndef;
6281 // The splat value may be something like "0x00FFFFFF", which means 0 for
6282 // the first vector value and FF for the rest, repeating. We need a mask
6283 // that will apply equally to all members of the vector, so AND all the
6284 // lanes of the constant together.
6285 unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
6287 // If the splat value has been compressed to a bitlength lower
6288 // than the size of the vector lane, we need to re-expand it to
6290 if (EltBitWidth > SplatBitSize)
6291 for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth);
6292 SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2)
6293 SplatValue |= SplatValue.shl(SplatBitSize);
6295 // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
6296 // multiple of 'EltBitWidth'. Otherwise, we could propagate a wrong value.
6297 if ((SplatBitSize % EltBitWidth) == 0) {
6298 Constant = APInt::getAllOnes(EltBitWidth);
6299 for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
6300 Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
6305 // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
6306 // actually legal and isn't going to get expanded, else this is a false
6308 bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
6309 Load->getValueType(0),
6310 Load->getMemoryVT());
6312 // Resize the constant to the same size as the original memory access before
6313 // extension. If it is still the AllOnesValue then this AND is completely
6315 Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
6318 switch (Load->getExtensionType()) {
6319 default: B = false; break;
6320 case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
6322 case ISD::NON_EXTLOAD: B = true; break;
6325 if (B && Constant.isAllOnes()) {
6326 // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
6327 // preserve semantics once we get rid of the AND.
6328 SDValue NewLoad(Load, 0);
6330 // Fold the AND away. NewLoad may get replaced immediately.
6331 CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
6333 if (Load->getExtensionType() == ISD::EXTLOAD) {
6334 NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
6335 Load->getValueType(0), SDLoc(Load),
6336 Load->getChain(), Load->getBasePtr(),
6337 Load->getOffset(), Load->getMemoryVT(),
6338 Load->getMemOperand());
6339 // Replace uses of the EXTLOAD with the new ZEXTLOAD.
6340 if (Load->getNumValues() == 3) {
6341 // PRE/POST_INC loads have 3 values.
6342 SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
6343 NewLoad.getValue(2) };
6344 CombineTo(Load, To, 3, true);
6346 CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
6350 return SDValue(N, 0); // Return N so it doesn't get rechecked!
6354 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && N1C &&
6355 ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
6356 SDValue Ext = N0.getOperand(0);
6357 EVT ExtVT = Ext->getValueType(0);
6358 SDValue Extendee = Ext->getOperand(0);
6360 unsigned ScalarWidth = Extendee.getValueType().getScalarSizeInBits();
6361 if (N1C->getAPIntValue().isMask(ScalarWidth)) {
6362 // (and (extract_subvector (zext|anyext|sext v) _) iN_mask)
6363 // => (extract_subvector (iN_zeroext v))
6364 SDValue ZeroExtExtendee =
6365 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Extendee);
6367 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, ZeroExtExtendee,
6372 // fold (and (masked_gather x)) -> (zext_masked_gather x)
6373 if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
6374 EVT MemVT = GN0->getMemoryVT();
6375 EVT ScalarVT = MemVT.getScalarType();
6377 if (SDValue(GN0, 0).hasOneUse() &&
6378 isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
6379 TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
6380 SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
6381 GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
6383 SDValue ZExtLoad = DAG.getMaskedGather(
6384 DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
6385 GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
6387 CombineTo(N, ZExtLoad);
6388 AddToWorklist(ZExtLoad.getNode());
6389 // Avoid recheck of N.
6390 return SDValue(N, 0);
6394 // fold (and (load x), 255) -> (zextload x, i8)
6395 // fold (and (extload x, i16), 255) -> (zextload x, i8)
6396 if (N1C && N0.getOpcode() == ISD::LOAD && !VT.isVector())
6397 if (SDValue Res = reduceLoadWidth(N))
6401 // Attempt to propagate the AND back up to the leaves which, if they're
6402 // loads, can be combined to narrow loads and the AND node can be removed.
6403 // Perform after legalization so that extend nodes will already be
6404 // combined into the loads.
6405 if (BackwardsPropagateMask(N))
6406 return SDValue(N, 0);
6409 if (SDValue Combined = visitANDLike(N0, N1, N))
6412 // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
6413 if (N0.getOpcode() == N1.getOpcode())
6414 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
6417 if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
6419 if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
6422 // Masking the negated extension of a boolean is just the zero-extended
6424 // and (sub 0, zext(bool X)), 1 --> zext(bool X)
6425 // and (sub 0, sext(bool X)), 1 --> zext(bool X)
6427 // Note: the SimplifyDemandedBits fold below can make an information-losing
6428 // transform, and then we have no way to find this better fold.
6429 if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
6430 if (isNullOrNullSplat(N0.getOperand(0))) {
6431 SDValue SubRHS = N0.getOperand(1);
6432 if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
6433 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
6435 if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
6436 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
6437 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
6441 // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
6442 // fold (and (sra)) -> (and (srl)) when possible.
6443 if (SimplifyDemandedBits(SDValue(N, 0)))
6444 return SDValue(N, 0);
6446 // fold (zext_inreg (extload x)) -> (zextload x)
6447 // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
6448 if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
6449 (ISD::isEXTLoad(N0.getNode()) ||
6450 (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
6451 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
6452 EVT MemVT = LN0->getMemoryVT();
6453 // If we zero all the possible extended bits, then we can turn this into
6454 // a zextload if we are running before legalize or the operation is legal.
6455 unsigned ExtBitSize = N1.getScalarValueSizeInBits();
6456 unsigned MemBitSize = MemVT.getScalarSizeInBits();
6457 APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
6458 if (DAG.MaskedValueIsZero(N1, ExtBits) &&
6459 ((!LegalOperations && LN0->isSimple()) ||
6460 TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
6462 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
6463 LN0->getBasePtr(), MemVT, LN0->getMemOperand());
6465 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
6466 return SDValue(N, 0); // Return N so it doesn't get rechecked!
6470 // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
6471 if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
6472 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
6473 N0.getOperand(1), false))
6477 if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
6480 if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
6483 // Recognize the following pattern:
6485 // AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask)
6487 // where bitmask is a mask that clears the upper bits of AndVT. The
6488 // number of bits in bitmask must be a power of two.
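// Illustrative example: and (sext i8 X to i32), 255 --> zext i8 X to i32,
// since the 0xff mask discards every bit the sign extension copied.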
6489 auto IsAndZeroExtMask = [](SDValue LHS, SDValue RHS) {
6490 if (LHS->getOpcode() != ISD::SIGN_EXTEND)
6493 auto *C = dyn_cast<ConstantSDNode>(RHS);
6497 if (!C->getAPIntValue().isMask(
6498 LHS.getOperand(0).getValueType().getFixedSizeInBits()))
6504 // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
6505 if (IsAndZeroExtMask(N0, N1))
6506 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
6508 if (hasOperation(ISD::USUBSAT, VT))
6509 if (SDValue V = foldAndToUsubsat(N, DAG))
6515 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
6516 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
6517 bool DemandHighBits) {
6518 if (!LegalOperations)
6521 EVT VT = N->getValueType(0);
6522 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
6524 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
6527 // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
6528 bool LookPassAnd0 = false;
6529 bool LookPassAnd1 = false;
6530 if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
6532 if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
6534 if (N0.getOpcode() == ISD::AND) {
6535 if (!N0->hasOneUse())
6537 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6538 // Also handle 0xffff since the LHS is guaranteed to have zeros there.
6539 // This is needed for X86.
6540 if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
6541 N01C->getZExtValue() != 0xFFFF))
6543 N0 = N0.getOperand(0);
6544 LookPassAnd0 = true;
6547 if (N1.getOpcode() == ISD::AND) {
6548 if (!N1->hasOneUse())
6550 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
6551 if (!N11C || N11C->getZExtValue() != 0xFF)
6553 N1 = N1.getOperand(0);
6554 LookPassAnd1 = true;
6557 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
6559 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
6561 if (!N0->hasOneUse() || !N1->hasOneUse())
6564 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6565 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
6568 if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
6571 // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
6572 SDValue N00 = N0->getOperand(0);
6573 if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
6574 if (!N00->hasOneUse())
6576 ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
6577 if (!N001C || N001C->getZExtValue() != 0xFF)
6579 N00 = N00.getOperand(0);
6580 LookPassAnd0 = true;
6583 SDValue N10 = N1->getOperand(0);
6584 if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
6585 if (!N10->hasOneUse())
6587 ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
6588 // Also allow 0xFFFF since the bits will be shifted out. This is needed
6590 if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
6591 N101C->getZExtValue() != 0xFFFF))
6593 N10 = N10.getOperand(0);
6594 LookPassAnd1 = true;
6600 // Make sure everything beyond the low halfword gets set to zero since the
6601 // SRL by 16 will clear the top bits.
6602 unsigned OpSizeInBits = VT.getSizeInBits();
6603 if (OpSizeInBits > 16) {
6604 // If the left-shift isn't masked out then the only way this is a bswap is
6605 // if all bits beyond the low 8 are 0. In that case the entire pattern
6606 // reduces to a left shift anyway: leave it for other parts of the combiner.
6607 if (DemandHighBits && !LookPassAnd0)
6610 // However, if the right shift isn't masked out then it might be because
6611 // it's not needed. See if we can spot that too. If the high bits aren't
6612 // demanded, we only need bits 23:16 to be zero. Otherwise, we need all
6613 // upper bits to be zero.
6614 if (!LookPassAnd1) {
6615 unsigned HighBit = DemandHighBits ? OpSizeInBits : 24;
6616 if (!DAG.MaskedValueIsZero(N10,
6617 APInt::getBitsSet(OpSizeInBits, 16, HighBit)))
6622 SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
6623 if (OpSizeInBits > 16) {
6625 Res = DAG.getNode(ISD::SRL, DL, VT, Res,
6626 DAG.getConstant(OpSizeInBits - 16, DL,
6627 getShiftAmountTy(VT)));
6632 /// Return true if the specified node is an element that makes up a 32-bit
6633 /// packed halfword byteswap.
6634 /// ((x & 0x000000ff) << 8) |
6635 /// ((x & 0x0000ff00) >> 8) |
6636 /// ((x & 0x00ff0000) << 8) |
6637 /// ((x & 0xff000000) >> 8)
6638 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
6639 if (!N->hasOneUse())
6642 unsigned Opc = N.getOpcode();
6643 if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
6646 SDValue N0 = N.getOperand(0);
6647 unsigned Opc0 = N0.getOpcode();
6648 if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
6651 ConstantSDNode *N1C = nullptr;
6652 // SHL or SRL: look upstream for AND mask operand
6653 if (Opc == ISD::AND)
6654 N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6655 else if (Opc0 == ISD::AND)
6656 N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6660 unsigned MaskByteOffset;
6661 switch (N1C->getZExtValue()) {
6664 case 0xFF: MaskByteOffset = 0; break;
6665 case 0xFF00: MaskByteOffset = 1; break;
6667 // In case demanded bits didn't clear the bits that will be shifted out.
6668 // This is needed for X86.
6669 if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
6674 case 0xFF0000: MaskByteOffset = 2; break;
6675 case 0xFF000000: MaskByteOffset = 3; break;
6678 // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
6679 if (Opc == ISD::AND) {
6680 if (MaskByteOffset == 0 || MaskByteOffset == 2) {
6682 // (x >> 8) & 0xff0000
6683 if (Opc0 != ISD::SRL)
6685 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6686 if (!C || C->getZExtValue() != 8)
6689 // (x << 8) & 0xff00
6690 // (x << 8) & 0xff000000
6691 if (Opc0 != ISD::SHL)
6693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6694 if (!C || C->getZExtValue() != 8)
6697 } else if (Opc == ISD::SHL) {
6699 // (x & 0xff0000) << 8
6700 if (MaskByteOffset != 0 && MaskByteOffset != 2)
6702 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6703 if (!C || C->getZExtValue() != 8)
6705 } else { // Opc == ISD::SRL
6706 // (x & 0xff00) >> 8
6707 // (x & 0xff000000) >> 8
6708 if (MaskByteOffset != 1 && MaskByteOffset != 3)
6710 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6711 if (!C || C->getZExtValue() != 8)
6715 if (Parts[MaskByteOffset])
6718 Parts[MaskByteOffset] = N0.getOperand(0).getNode();
6722 // Match 2 elements of a packed halfword bswap.
6723 static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
6724 if (N.getOpcode() == ISD::OR)
6725 return isBSwapHWordElement(N.getOperand(0), Parts) &&
6726 isBSwapHWordElement(N.getOperand(1), Parts);
6728 if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
6729 ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
6730 if (!C || C->getAPIntValue() != 16)
6732 Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
6739 // Match this pattern:
6740 // (or (and (shl A, 8), 0xff00ff00), (and (srl A, 8), 0x00ff00ff))
6741 // And rewrite this to:
6742 // (rotr (bswap A), 16)
6743 static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
6744 SelectionDAG &DAG, SDNode *N, SDValue N0,
6745 SDValue N1, EVT VT, EVT ShiftAmountTy) {
6746 assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
6747 "MatchBSwapHWordOrAndAnd: expecting i32");
6748   if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
6749     return SDValue();
6750   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
6751     return SDValue();
6752   // TODO: this is too restrictive; lifting this restriction requires more tests
6753   if (!N0->hasOneUse() || !N1->hasOneUse())
6754     return SDValue();
6755   ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
6756   ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
6757   if (!Mask0 || !Mask1)
6758     return SDValue();
6759   if (Mask0->getAPIntValue() != 0xff00ff00 ||
6760       Mask1->getAPIntValue() != 0x00ff00ff)
6761     return SDValue();
6762   SDValue Shift0 = N0.getOperand(0);
6763   SDValue Shift1 = N1.getOperand(0);
6764   if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
6765     return SDValue();
6766   ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
6767   ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
6768   if (!ShiftAmt0 || !ShiftAmt1)
6769     return SDValue();
6770   if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
6771     return SDValue();
6772   if (Shift0.getOperand(0) != Shift1.getOperand(0))
6773     return SDValue();
6775   SDLoc DL(N);
6776 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
6777 SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
6778   return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
6779 }
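// Worked check of the identity matched above (illustrative, not from the
// original file): for A = 0x11223344,
//   (A << 8) & 0xff00ff00 = 0x22004400
//   (A >> 8) & 0x00ff00ff = 0x00110033
//   OR of the two         = 0x22114433
// while bswap(A) = 0x44332211 and rotr(0x44332211, 16) = 0x22114433, so the
// masked-shift OR is exactly (rotr (bswap A), 16).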
6781 /// Match a 32-bit packed halfword bswap. That is
6782 /// ((x & 0x000000ff) << 8) |
6783 /// ((x & 0x0000ff00) >> 8) |
6784 /// ((x & 0x00ff0000) << 8) |
6785 /// ((x & 0xff000000) >> 8)
6786 /// => (rotl (bswap x), 16)
6787 SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
6788   if (!LegalOperations)
6789     return SDValue();
6791   EVT VT = N->getValueType(0);
6792   if (VT != MVT::i32)
6793     return SDValue();
6794   if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
6795     return SDValue();
6797   if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
6798                                               getShiftAmountTy(VT)))
6799     return BSwap;
6801   // Try again with commuted operands.
6802   if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
6803                                               getShiftAmountTy(VT)))
6804     return BSwap;
6807   // Look for either
6808 // (or (bswaphpair), (bswaphpair))
6809 // (or (or (bswaphpair), (and)), (and))
6810 // (or (or (and), (bswaphpair)), (and))
6811 SDNode *Parts[4] = {};
6813   if (isBSwapHWordPair(N0, Parts)) {
6814     // (or (or (and), (and)), (or (and), (and)))
6815     if (!isBSwapHWordPair(N1, Parts))
6816       return SDValue();
6817   } else if (N0.getOpcode() == ISD::OR) {
6818     // (or (or (or (and), (and)), (and)), (and))
6819     if (!isBSwapHWordElement(N1, Parts))
6820       return SDValue();
6821     SDValue N00 = N0.getOperand(0);
6822     SDValue N01 = N0.getOperand(1);
6823     if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
6824         !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
6825       return SDValue();
6826   } else {
6827     return SDValue();
6828   }
6830 // Make sure the parts are all coming from the same node.
6831   if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
6832     return SDValue();
6834   SDLoc DL(N);
6835 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
6836 SDValue(Parts[0], 0));
6838 // Result of the bswap should be rotated by 16. If it's not legal, then
6839 // do (x << 16) | (x >> 16).
6840 SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
6841 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
6842 return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
6843 if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
6844 return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
6845 return DAG.getNode(ISD::OR, DL, VT,
6846 DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
6847                      DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
6848 }
6850 /// This contains all DAGCombine rules which reduce two values combined by
6851 /// an Or operation to a single value \see visitANDLike().
6852 SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
6853   EVT VT = N1.getValueType();
6855   SDLoc DL(N);
6856 // fold (or x, undef) -> -1
6857 if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
6858 return DAG.getAllOnesConstant(DL, VT);
6860   if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
6861     return V;
6863 // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
6864 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
6865 // Don't increase # computations.
6866 (N0->hasOneUse() || N1->hasOneUse())) {
6867 // We can only do this xform if we know that bits from X that are set in C2
6868 // but not in C1 are already zero. Likewise for Y.
6869 if (const ConstantSDNode *N0O1C =
6870 getAsNonOpaqueConstant(N0.getOperand(1))) {
6871 if (const ConstantSDNode *N1O1C =
6872 getAsNonOpaqueConstant(N1.getOperand(1))) {
6873 // We can only do this xform if we know that bits from X that are set in
6874 // C2 but not in C1 are already zero. Likewise for Y.
6875 const APInt &LHSMask = N0O1C->getAPIntValue();
6876 const APInt &RHSMask = N1O1C->getAPIntValue();
6878 if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
6879 DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
6880 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
6881 N0.getOperand(0), N1.getOperand(0));
6882 return DAG.getNode(ISD::AND, DL, VT, X,
6883                              DAG.getConstant(LHSMask | RHSMask, DL, VT));
6884         }
6885       }
6886     }
6887   }
6889 // (or (and X, M), (and X, N)) -> (and X, (or M, N))
6890 if (N0.getOpcode() == ISD::AND &&
6891 N1.getOpcode() == ISD::AND &&
6892 N0.getOperand(0) == N1.getOperand(0) &&
6893 // Don't increase # computations.
6894 (N0->hasOneUse() || N1->hasOneUse())) {
6895 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
6896 N0.getOperand(1), N1.getOperand(1));
6897     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
6898   }
6900   return SDValue();
6901 }
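// Worked example of the mask fold above (a sketch, not from the original
// file): with C1 = 0xF0 and C2 = 0x0F,
//   (or (and X, 0xF0), (and Y, 0x0F)) --> (and (or X, Y), 0xFF)
// is valid once MaskedValueIsZero proves X has no bits set under 0x0F and
// Y none under 0xF0; two ANDs and an OR become one OR and one AND.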
6903 /// OR combines for which the commuted variant will be tried as well.
6904 static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
6905                                   SDNode *N) {
6906 EVT VT = N0.getValueType();
6907 if (N0.getOpcode() == ISD::AND) {
6908 SDValue N00 = N0.getOperand(0);
6909 SDValue N01 = N0.getOperand(1);
6911 // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
6912 // TODO: Set AllowUndefs = true.
6913 if (getBitwiseNotOperand(N01, N00,
6914 /* AllowUndefs */ false) == N1)
6915 return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1);
6917 // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
6918 if (getBitwiseNotOperand(N00, N01,
6919 /* AllowUndefs */ false) == N1)
6920 return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
6923   if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
6924     return R;
6926   auto peekThroughZext = [](SDValue V) {
6927     if (V->getOpcode() == ISD::ZERO_EXTEND)
6928       return V->getOperand(0);
6929     return V;
6930   };
6932 // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
6933 if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::SHL &&
6934 N0.getOperand(0) == N1.getOperand(0) &&
6935       peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
6936     return N0;
6938 // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y
6939 if (N0.getOpcode() == ISD::FSHR && N1.getOpcode() == ISD::SRL &&
6940 N0.getOperand(1) == N1.getOperand(0) &&
6941       peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
6942     return N0;
6944   return SDValue();
6945 }
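// Worked justification for the absorption folds above (illustrative, not
// from the original file): for bitwidth BW and shift amount Y in (0, BW),
//   fshl(X, Z, Y) = (X << Y) | (Z >> (BW - Y))
// already contains every bit of (shl X, Y), so OR'ing that shift back in
// adds nothing and the whole expression collapses to the fshl node; the
// fshr/srl case is symmetric.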
6947 SDValue DAGCombiner::visitOR(SDNode *N) {
6948 SDValue N0 = N->getOperand(0);
6949 SDValue N1 = N->getOperand(1);
6950   EVT VT = N1.getValueType();
6952   // x | x --> x
6953   if (N0 == N1)
6954     return N0;
6956   // fold (or c1, c2) -> c1|c2
6957   if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
6958     return C;
6960 // canonicalize constant to RHS
6961 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
6962 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
6963     return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
6965   // fold vector ops
6966   if (VT.isVector()) {
6967     if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
6968       return FoldedVOp;
6970     // fold (or x, 0) -> x, vector edition
6971     if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
6972       return N0;
6974 // fold (or x, -1) -> -1, vector edition
6975 if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
6976 // do not return N1, because undef node may exist in N1
6977 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
6979 // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
6980 // Do this only if the resulting type / shuffle is legal.
6981 auto *SV0 = dyn_cast<ShuffleVectorSDNode>(N0);
6982 auto *SV1 = dyn_cast<ShuffleVectorSDNode>(N1);
6983 if (SV0 && SV1 && TLI.isTypeLegal(VT)) {
6984 bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
6985 bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
6986 bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
6987 bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
6988 // Ensure both shuffles have a zero input.
6989 if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
6990 assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
6991 assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
6992 bool CanFold = true;
6993 int NumElts = VT.getVectorNumElements();
6994 SmallVector<int, 4> Mask(NumElts, -1);
6996 for (int i = 0; i != NumElts; ++i) {
6997 int M0 = SV0->getMaskElt(i);
6998 int M1 = SV1->getMaskElt(i);
7000 // Determine if either index is pointing to a zero vector.
7001 bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
7002 bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
7004         // If one element is zero and the other side is undef, keep undef.
7005         // This also handles the case that both are undef.
7006         if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0))
7007           continue;
7009         // Make sure only one of the elements is zero.
7010         if (M0Zero == M1Zero) {
7011           CanFold = false;
7012           break;
7013         }
7015 assert((M0 >= 0 || M1 >= 0) && "Undef index!");
7017 // We have a zero and non-zero element. If the non-zero came from
7018 // SV0 make the index a LHS index. If it came from SV1, make it
7019 // a RHS index. We need to mod by NumElts because we don't care
7020 // which operand it came from in the original shuffles.
7021         Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
7022       }
7024       if (CanFold) {
7025 SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
7026 SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
7028         SDValue LegalShuffle =
7029             TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
7030                                         Mask, DAG);
7031         if (LegalShuffle)
7032           return LegalShuffle;
7033       }
7034     }
7035   }
7036 }
7038 // fold (or x, 0) -> x
7039   if (isNullConstant(N1))
7040     return N0;
7042   // fold (or x, -1) -> -1
7043   if (isAllOnesConstant(N1))
7044     return N1;
7046   if (SDValue NewSel = foldBinOpIntoSelect(N))
7047     return NewSel;
7049   // fold (or x, c) -> c iff (x & ~c) == 0
7050   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
7051   if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
7052     return N1;
7054   if (SDValue Combined = visitORLike(N0, N1, N))
7055     return Combined;
7057   if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
7058     return Combined;
7060   // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
7061   if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
7062     return BSwap;
7063   if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
7064     return BSwap;
7066   // reassociate or
7067   if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
7068     return ROR;
7070 // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
7071 // iff (c1 & c2) != 0 or c1/c2 are undef.
7072 auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
7073     return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
7074   };
7075   if (N0.getOpcode() == ISD::AND && N0->hasOneUse() &&
7076       ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
7077     if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
7078                                                  {N1, N0.getOperand(1)})) {
7079       SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
7080       AddToWorklist(IOR.getNode());
7081       return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
7082     }
7083   }
7085   if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
7086     return Combined;
7087   if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
7088     return Combined;
7090   // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
7091   if (N0.getOpcode() == N1.getOpcode())
7092     if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
7093       return V;
7095   // See if this is some rotate idiom.
7096   if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
7097     return Rot;
7099   if (SDValue Load = MatchLoadCombine(N))
7100     return Load;
7102 // Simplify the operands using demanded-bits information.
7103   if (SimplifyDemandedBits(SDValue(N, 0)))
7104     return SDValue(N, 0);
7106   // If OR can be rewritten into ADD, try combines based on ADD.
7107   if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
7108       DAG.haveNoCommonBitsSet(N0, N1))
7109     if (SDValue Combined = visitADDLike(N))
7110       return Combined;
7112   return SDValue();
7113 }
7115 static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
7116   if (Op.getOpcode() == ISD::AND &&
7117       DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
7118     Mask = Op.getOperand(1);
7119     return Op.getOperand(0);
7120   }
7121   return Op;
7122 }
7124 /// Match "(X shl/srl V1) & V2" where V2 may not be present.
7125 static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
7126                             SDValue &Mask) {
7127   Op = stripConstantMask(DAG, Op, Mask);
7128   if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
7129     Shift = Op;
7130     return true;
7131   }
7132   return false;
7133 }
7135 /// Helper function for visitOR to extract the needed side of a rotate idiom
7136 /// from a shl/srl/mul/udiv. This is meant to handle cases where
7137 /// InstCombine merged some outside op with one of the shifts from
7138 /// the rotate pattern.
7139 /// \returns An empty \c SDValue if the needed shift couldn't be extracted.
7140 /// Otherwise, returns an expansion of \p ExtractFrom based on the following
7141 /// patterns:
7142 ///
7143 /// (or (add v v) (shrl v bitwidth-1)):
7144 ///   expands (add v v) -> (shl v 1)
7145 ///
7146 /// (or (mul v c0) (shrl (mul v c1) c2)):
7147 ///   expands (mul v c0) -> (shl (mul v c1) c3)
7148 ///
7149 /// (or (udiv v c0) (shl (udiv v c1) c2)):
7150 ///   expands (udiv v c0) -> (shrl (udiv v c1) c3)
7151 ///
7152 /// (or (shl v c0) (shrl (shl v c1) c2)):
7153 ///   expands (shl v c0) -> (shl (shl v c1) c3)
7154 ///
7155 /// (or (shrl v c0) (shl (shrl v c1) c2)):
7156 ///   expands (shrl v c0) -> (shrl (shrl v c1) c3)
7157 ///
7158 /// Such that in all cases, c3+c2==bitwidth(op v c1).
7159 static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
7160                                      SDValue ExtractFrom, SDValue &Mask,
7161                                      const SDLoc &DL) {
7162   assert(OppShift && ExtractFrom && "Empty SDValue");
7163   assert(
7164       (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
7165       "Existing shift must be valid as a rotate half");
7167 ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
7169 // Value and Type of the shift.
7170 SDValue OppShiftLHS = OppShift.getOperand(0);
7171 EVT ShiftedVT = OppShiftLHS.getValueType();
7173 // Amount of the existing shift.
7174 ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
7176 // (add v v) -> (shl v 1)
7177 // TODO: Should this be a general DAG canonicalization?
7178 if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
7179 ExtractFrom.getOpcode() == ISD::ADD &&
7180 ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
7181 ExtractFrom.getOperand(0) == OppShiftLHS &&
7182 OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
7183 return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
7184 DAG.getShiftAmountConstant(1, ShiftedVT, DL));
7186   // Preconditions:
7187   //    (or (op0 v c0) (shiftl/r (op0 v c1) c2))
7189 // Find opcode of the needed shift to be extracted from (op0 v c0).
7190 unsigned Opcode = ISD::DELETED_NODE;
7191 bool IsMulOrDiv = false;
7192 // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
7193 // opcode or its arithmetic (mul or udiv) variant.
7194 auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
7195 IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
7196     if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
7197       return false;
7198     Opcode = NeededShift;
7199     return true;
7200   };
7201 // op0 must be either the needed shift opcode or the mul/udiv equivalent
7202 // that the needed shift can be extracted from.
7203   if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
7204       (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
7205     return SDValue();
7207   // op0 must be the same opcode on both sides, have the same LHS argument,
7208   // and produce the same value type.
7209   if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
7210       OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
7211       ShiftedVT != ExtractFrom.getValueType())
7212     return SDValue();
7214   // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
7215   ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
7216   // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
7217   ConstantSDNode *ExtractFromCst =
7218       isConstOrConstSplat(ExtractFrom.getOperand(1));
7219   // TODO: We should be able to handle non-uniform constant vectors for these values
7220   // Check that we have constant values.
7221   if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
7222       !OppLHSCst || !OppLHSCst->getAPIntValue() ||
7223       !ExtractFromCst || !ExtractFromCst->getAPIntValue())
7224     return SDValue();
7226   // Compute the shift amount we need to extract to complete the rotate.
7227   const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
7228   if (OppShiftCst->getAPIntValue().ugt(VTWidth))
7229     return SDValue();
7230 APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
7231 // Normalize the bitwidth of the two mul/udiv/shift constant operands.
7232 APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
7233 APInt OppLHSAmt = OppLHSCst->getAPIntValue();
7234 zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
7236 // Now try extract the needed shift from the ExtractFrom op and see if the
7237   // result matches up with the existing shift's LHS op.
7238   if (IsMulOrDiv) {
7239     // Op to extract from is a mul or udiv by a constant.
7240     // Check:
7241     //     c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
7242     //     c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
7243     const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
7244                                                  NeededShiftAmt.getZExtValue());
7245     APInt ResultAmt;
7246     APInt Rem;
7247     APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
7248     if (Rem != 0 || ResultAmt != OppLHSAmt)
7249       return SDValue();
7250   } else {
7251     // Op to extract from is a shift by a constant.
7252     // Check:
7253     //      c2 - (bitwidth(op0 v c0) - c1) == c0
7254     if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
7255                                           ExtractFromAmt.getBitWidth()))
7256       return SDValue();
7257   }
7259 // Return the expanded shift op that should allow a rotate to be formed.
7260 EVT ShiftVT = OppShift.getOperand(1).getValueType();
7261 EVT ResVT = ExtractFrom.getValueType();
7262 SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
7263   return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
7264 }
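// Worked example for the mul variant above (illustrative, not from the
// original file), with an i32 value v:
//   (or (mul v, 48), (srl (mul v, 3), 28))
// The needed shift amount is 32 - 28 = 4, and 48 / (1 << 4) = 3 with
// remainder 0, matching the inner constant. (mul v, 48) is therefore
// expanded to (shl (mul v, 3), 4), exposing
//   (or (shl t, 4), (srl t, 28)) with t = (mul v, 3)
// which the rotate matcher can then fold to (rotl t, 4).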
7266 // Return true if we can prove that, whenever Neg and Pos are both in the
7267 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
7268 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
7270 // (or (shift1 X, Neg), (shift2 X, Pos))
7272 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
7273 // in direction shift1 by Neg. The range [0, EltSize) means that we only need
7274 // to consider shift amounts with defined behavior.
7276 // The IsRotate flag should be set when the LHS of both shifts is the same.
7277 // Otherwise if matching a general funnel shift, it should be clear.
7278 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
7279 SelectionDAG &DAG, bool IsRotate) {
7280 const auto &TLI = DAG.getTargetLoweringInfo();
7281 // If EltSize is a power of 2 then:
7283 // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
7284 // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
7286 // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
7287 // for the stronger condition:
7289 // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
7291 // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
7292 // we can just replace Neg with Neg' for the rest of the function.
7294 // In other cases we check for the even stronger condition:
7296 // Neg == EltSize - Pos [B]
7298 // for all Neg and Pos. Note that the (or ...) then invokes undefined
7299 // behavior if Pos == 0 (and consequently Neg == EltSize).
7301 // We could actually use [A] whenever EltSize is a power of 2, but the
7302 // only extra cases that it would match are those uninteresting ones
7303 // where Neg and Pos are never in range at the same time. E.g. for
7304 // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
7305 // as well as (sub 32, Pos), but:
7307 // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
7309 // always invokes undefined behavior for 32-bit X.
7311 // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
7312 // This allows us to peek through any operations that only affect Mask's
7313 // un-demanded bits.
7315 // NOTE: We can only do this when matching operations which won't modify the
7316 // least Log2(EltSize) significant bits and not a general funnel shift.
7317 unsigned MaskLoBits = 0;
7318 if (IsRotate && isPowerOf2_64(EltSize)) {
7319 unsigned Bits = Log2_64(EltSize);
7320 unsigned NegBits = Neg.getScalarValueSizeInBits();
7321     if (NegBits >= Bits) {
7322       APInt DemandedBits = APInt::getLowBitsSet(NegBits, Bits);
7323       if (SDValue Inner =
7324               TLI.SimplifyMultipleUseDemandedBits(Neg, DemandedBits, DAG)) {
7325         Neg = Inner;
7326         MaskLoBits = Bits;
7327       }
7328     }
7329   }
7331 // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
7332   if (Neg.getOpcode() != ISD::SUB)
7333     return false;
7334   ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
7335   if (!NegC)
7336     return false;
7337   SDValue NegOp1 = Neg.getOperand(1);
7339   // On the RHS of [A], if Pos is the result of operation on Pos' that won't
7340   // affect Mask's demanded bits, just replace Pos with Pos'. These operations
7341   // are redundant for the purpose of the equality.
7342   if (MaskLoBits) {
7343     unsigned PosBits = Pos.getScalarValueSizeInBits();
7344     if (PosBits >= MaskLoBits) {
7345       APInt DemandedBits = APInt::getLowBitsSet(PosBits, MaskLoBits);
7346       if (SDValue Inner =
7347               TLI.SimplifyMultipleUseDemandedBits(Pos, DemandedBits, DAG)) {
7348         Pos = Inner;
7349       }
7350     }
7351   }
7353 // The condition we need is now:
7355 // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
7357 // If NegOp1 == Pos then we need:
7359 // EltSize & Mask == NegC & Mask
7361 // (because "x & Mask" is a truncation and distributes through subtraction).
7363 // We also need to account for a potential truncation of NegOp1 if the amount
7364 // has already been legalized to a shift amount type.
7365   APInt Width;
7366   if ((Pos == NegOp1) ||
7367       (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
7368     Width = NegC->getAPIntValue();
7370   // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
7371   // Then the condition we want to prove becomes:
7372   //
7373   //   (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
7374   //
7375   // which, again because "x & Mask" is a truncation, becomes:
7376   //
7377   //   NegC & Mask == (EltSize - PosC) & Mask
7378   //   EltSize & Mask == (NegC + PosC) & Mask
7379   else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
7380     if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
7381       Width = PosC->getAPIntValue() + NegC->getAPIntValue();
7382     else
7383       return false;
7384   } else
7385     return false;
7387   // Now we just need to check that EltSize & Mask == Width & Mask.
7388   if (MaskLoBits)
7389     // EltSize & Mask is 0 since Mask is EltSize - 1.
7390     return Width.getLoBits(MaskLoBits) == 0;
7391   return Width == EltSize;
7392 }
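// Worked instance of condition [A] above (a sketch, not from the original
// file): with EltSize = 32, the common "safe rotate" idiom
//   (or (shl x, (and y, 31)), (srl x, (and (sub 32, y), 31)))
// passes the check because ((32 - y) & 31) == (32 - (y & 31)) & 31 for all
// y, so the caller may emit (rotl x, y) even though (sub 32, y) alone
// would be 32 (out of range) when y == 0.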
7394 // A subroutine of MatchRotate used once we have found an OR of two opposite
7395 // shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
7396 // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
7397 // former being preferred if supported. InnerPos and InnerNeg are Pos and
7398 // Neg with outer conversions stripped away.
7399 SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
7400 SDValue Neg, SDValue InnerPos,
7401 SDValue InnerNeg, bool HasPos,
7402                                         unsigned PosOpcode, unsigned NegOpcode,
7403                                         const SDLoc &DL) {
7404 // fold (or (shl x, (*ext y)),
7405 // (srl x, (*ext (sub 32, y)))) ->
7406 // (rotl x, y) or (rotr x, (sub 32, y))
7408 // fold (or (shl x, (*ext (sub 32, y))),
7409 // (srl x, (*ext y))) ->
7410 // (rotr x, y) or (rotl x, (sub 32, y))
7411 EVT VT = Shifted.getValueType();
7412 if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
7413 /*IsRotate*/ true)) {
7414 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
7415                        HasPos ? Pos : Neg);
7416   }
7418   return SDValue();
7419 }
7421 // A subroutine of MatchRotate used once we have found an OR of two opposite
7422 // shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
7423 // to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
7424 // former being preferred if supported. InnerPos and InnerNeg are Pos and
7425 // Neg with outer conversions stripped away.
7426 // TODO: Merge with MatchRotatePosNeg.
7427 SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
7428 SDValue Neg, SDValue InnerPos,
7429 SDValue InnerNeg, bool HasPos,
7430                                         unsigned PosOpcode, unsigned NegOpcode,
7431                                         const SDLoc &DL) {
7432 EVT VT = N0.getValueType();
7433 unsigned EltBits = VT.getScalarSizeInBits();
7435 // fold (or (shl x0, (*ext y)),
7436 // (srl x1, (*ext (sub 32, y)))) ->
7437 // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
7439 // fold (or (shl x0, (*ext (sub 32, y))),
7440 // (srl x1, (*ext y))) ->
7441 // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
7442 if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
7443 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
7444                        HasPos ? Pos : Neg);
7445   }
7447 // Matching the shift+xor cases, we can't easily use the xor'd shift amount
7448 // so for now just use the PosOpcode case if its legal.
7449 // TODO: When can we use the NegOpcode case?
7450 if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
7451     auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
7452       if (Op.getOpcode() != BinOpc)
7453         return false;
7454       ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
7455       return Cst && (Cst->getAPIntValue() == Imm);
7456     };
7458 // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
7459 // -> (fshl x0, x1, y)
7460 if (IsBinOpImm(N1, ISD::SRL, 1) &&
7461 IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
7462 InnerPos == InnerNeg.getOperand(0) &&
7463 TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
7464       return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
7465     }
7467 // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
7468 // -> (fshr x0, x1, y)
7469 if (IsBinOpImm(N0, ISD::SHL, 1) &&
7470 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
7471 InnerNeg == InnerPos.getOperand(0) &&
7472 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
7473       return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
7474     }
7476 // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
7477 // -> (fshr x0, x1, y)
7478 // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
7479 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
7480 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
7481 InnerNeg == InnerPos.getOperand(0) &&
7482 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
7483       return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
7484     }
7485   }
7487   return SDValue();
7488 }
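// Worked example of the shift+xor form above (illustrative, not from the
// original file), for i32 operands and y in [0, 31]:
//   (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
// Since (xor y, 31) == 31 - y here, the total right shift is
// 1 + (31 - y) == 32 - y, which is precisely the low half of
// fshl(x0, x1, y) = (x0 << y) | (x1 >> (32 - y)).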
7490 // MatchRotate - Handle an 'or' of two operands. If this is one of the many
7491 // idioms for rotate, and if the target supports rotation instructions, generate
7492 // a rot[lr]. This also matches funnel shift patterns, similar to rotation but
7493 // with different shifted sources.
7494 SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
7495 EVT VT = LHS.getValueType();
7497 // The target must have at least one rotate/funnel flavor.
7498 // We still try to match rotate by constant pre-legalization.
7499 // TODO: Support pre-legalization funnel-shift by constant.
7500 bool HasROTL = hasOperation(ISD::ROTL, VT);
7501 bool HasROTR = hasOperation(ISD::ROTR, VT);
7502 bool HasFSHL = hasOperation(ISD::FSHL, VT);
7503 bool HasFSHR = hasOperation(ISD::FSHR, VT);
7505 // If the type is going to be promoted and the target has enabled custom
7506 // lowering for rotate, allow matching rotate by non-constants. Only allow
7507 // this for scalar types.
7508 if (VT.isScalarInteger() && TLI.getTypeAction(*DAG.getContext(), VT) ==
7509 TargetLowering::TypePromoteInteger) {
7510 HasROTL |= TLI.getOperationAction(ISD::ROTL, VT) == TargetLowering::Custom;
7511     HasROTR |= TLI.getOperationAction(ISD::ROTR, VT) == TargetLowering::Custom;
7512   }
7514   if (LegalOperations && !HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
7515     return SDValue();
7517 // Check for truncated rotate.
7518 if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
7519 LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
7520 assert(LHS.getValueType() == RHS.getValueType());
7521 if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
7522       return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
7523     }
7524   }
7526 // Match "(X shl/srl V1) & V2" where V2 may not be present.
7527 SDValue LHSShift; // The shift.
7528 SDValue LHSMask; // AND value if any.
7529 matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
7531 SDValue RHSShift; // The shift.
7532 SDValue RHSMask; // AND value if any.
7533 matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
7535 // If neither side matched a rotate half, bail
7536   if (!LHSShift && !RHSShift)
7537     return SDValue();
7539 // InstCombine may have combined a constant shl, srl, mul, or udiv with one
7540 // side of the rotate, so try to handle that here. In all cases we need to
7541 // pass the matched shift from the opposite side to compute the opcode and
7542 // needed shift amount to extract. We still want to do this if both sides
7543 // matched a rotate half because one half may be a potential overshift that
7544   // can be broken down (ie if InstCombine merged two shl or srl ops into a
7545   // single one).
7547   // Have LHS side of the rotate, try to extract the needed shift from the RHS.
7548   if (LHSShift)
7549 if (SDValue NewRHSShift =
7550 extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
7551 RHSShift = NewRHSShift;
7552   // Have RHS side of the rotate, try to extract the needed shift from the LHS.
7553   if (RHSShift)
7554 if (SDValue NewLHSShift =
7555 extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
7556 LHSShift = NewLHSShift;
7558 // If a side is still missing, nothing else we can do.
7559   if (!RHSShift || !LHSShift)
7560     return SDValue();
7562 // At this point we've matched or extracted a shift op on each side.
7564 if (LHSShift.getOpcode() == RHSShift.getOpcode())
7565 return SDValue(); // Shifts must disagree.
7567 // Canonicalize shl to left side in a shl/srl pair.
7568 if (RHSShift.getOpcode() == ISD::SHL) {
7569 std::swap(LHS, RHS);
7570 std::swap(LHSShift, RHSShift);
7571     std::swap(LHSMask, RHSMask);
7572   }
7574 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7575 SDValue LHSShiftArg = LHSShift.getOperand(0);
7576 SDValue LHSShiftAmt = LHSShift.getOperand(1);
7577 SDValue RHSShiftArg = RHSShift.getOperand(0);
7578 SDValue RHSShiftAmt = RHSShift.getOperand(1);
7580 auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
7581 ConstantSDNode *RHS) {
7582     return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
7583   };
7585   auto ApplyMasks = [&](SDValue Res) {
7586     // If there is an AND of either shifted operand, apply it to the result.
7587     if (LHSMask.getNode() || RHSMask.getNode()) {
7588       SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
7589       SDValue Mask = AllOnes;
7591       if (LHSMask.getNode()) {
7592         SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
7593         Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
7594                            DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
7595       }
7596       if (RHSMask.getNode()) {
7597         SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
7598         Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
7599                            DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
7600       }
7602       Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
7603     }
7605     return Res;
7606   };
7608 // TODO: Support pre-legalization funnel-shift by constant.
7609 bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
7610 if (!IsRotate && !(HasFSHL || HasFSHR)) {
7611 if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() &&
7612 ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
7613 // Look for a disguised rotate by constant.
7614 // The common shifted operand X may be hidden inside another 'or'.
7615       SDValue X, Y;
7616       auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) {
7617         if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR)
7618           return false;
7619         if (CommonOp == Or.getOperand(0)) {
7620           X = CommonOp;
7621           Y = Or.getOperand(1);
7622           return true;
7623         }
7624         if (CommonOp == Or.getOperand(1)) {
7625           X = CommonOp;
7626           Y = Or.getOperand(0);
7627           return true;
7628         }
7629         return false;
7630       };
7632       SDValue Res;
7633 if (matchOr(LHSShiftArg, RHSShiftArg)) {
7634 // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1)
7635 SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
7636 SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt);
7637 Res = DAG.getNode(ISD::OR, DL, VT, RotX, ShlY);
7638 } else if (matchOr(RHSShiftArg, LHSShiftArg)) {
7639 // (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2)
7640 SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
7641 SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt);
7642         Res = DAG.getNode(ISD::OR, DL, VT, RotX, SrlY);
7643       } else {
7644         return SDValue();
7645       }
7647       return ApplyMasks(Res);
7648     }
7650     return SDValue(); // Requires funnel shift support.
7651   }
7653 // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
7654 // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
7655 // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
7656 // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
7657 // iff C1+C2 == EltSizeInBits
7658   if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
7659     SDValue Res;
7660     if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) {
7661       bool UseROTL = !LegalOperations || HasROTL;
7662       Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
7663                         UseROTL ? LHSShiftAmt : RHSShiftAmt);
7664     } else {
7665       bool UseFSHL = !LegalOperations || HasFSHL;
7666       Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
7667                         RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt);
7668     }
7670     return ApplyMasks(Res);
7671   }
7673   // Even pre-legalization, we can't easily rotate/funnel-shift by a variable
7674   // amount.
7675   if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
7676     return SDValue();
7678 // If there is a mask here, and we have a variable shift, we can't be sure
7679 // that we're masking out the right stuff.
7680   if (LHSMask.getNode() || RHSMask.getNode())
7681     return SDValue();
7683 // If the shift amount is sign/zext/any-extended just peel it off.
7684 SDValue LExtOp0 = LHSShiftAmt;
7685 SDValue RExtOp0 = RHSShiftAmt;
7686 if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
7687 LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
7688 LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
7689 LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
7690 (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
7691 RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
7692 RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
7693 RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
7694 LExtOp0 = LHSShiftAmt.getOperand(0);
7695     RExtOp0 = RHSShiftAmt.getOperand(0);
7696   }
7698   if (IsRotate && (HasROTL || HasROTR)) {
7699     SDValue TryL =
7700         MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
7701                           RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
7702     if (TryL)
7703       return TryL;
7705     SDValue TryR =
7706         MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
7707                           LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
7708     if (TryR)
7709       return TryR;
7710   }
7712   SDValue TryL =
7713       MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
7714                         LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
7715   if (TryL)
7716     return TryL;
7718   SDValue TryR =
7719       MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
7720                         RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
7721   if (TryR)
7722     return TryR;
7724   return SDValue();
7725 }
7727 namespace {
7729 /// Represents known origin of an individual byte in load combine pattern. The
7730 /// value of the byte is either constant zero or comes from memory.
7731 struct ByteProvider {
7732 // For constant zero providers Load is set to nullptr. For memory providers
7733 // Load represents the node which loads the byte from memory.
7734 // ByteOffset is the offset of the byte in the value produced by the load.
7735 LoadSDNode *Load = nullptr;
7736 unsigned ByteOffset = 0;
7738 ByteProvider() = default;
7740 static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
7741     return ByteProvider(Load, ByteOffset);
7742   }
7744 static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
7746 bool isConstantZero() const { return !Load; }
7747 bool isMemory() const { return Load; }
7749 bool operator==(const ByteProvider &Other) const {
7750     return Other.Load == Load && Other.ByteOffset == ByteOffset;
7751   }
7753 private:
7754   ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
7755       : Load(Load), ByteOffset(ByteOffset) {}
7756 };
7758 } // end anonymous namespace
7760 /// Recursively traverses the expression calculating the origin of the requested
7761 /// byte of the given value. Returns None if the provider can't be calculated.
7763 /// For all the values except the root of the expression, verifies that the
7764 /// value has exactly one use; if that is not the case, returns None. This way,
7765 /// if the origin of the byte is returned, it's guaranteed that the values
7766 /// which contribute to the byte are not used outside of this expression.
7768 /// Because the parts of the expression are not allowed to have more than one
7769 /// use this function iterates over trees, not DAGs. So it never visits the same
7770 /// node more than once.
7771 static const Optional<ByteProvider>
7772 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
7773 bool Root = false) {
7774   // Typical i64 by i8 pattern requires recursion up to 8 calls depth
7775   if (Depth == 10)
7776     return None;
7778   if (!Root && !Op.hasOneUse())
7779     return None;
7781   assert(Op.getValueType().isScalarInteger() && "can't handle other types");
7782   unsigned BitWidth = Op.getValueSizeInBits();
7783   if (BitWidth % 8 != 0)
7784     return None;
7785   unsigned ByteWidth = BitWidth / 8;
7786   assert(Index < ByteWidth && "invalid index requested");
7787   (void) ByteWidth;
7789   switch (Op.getOpcode()) {
7790   case ISD::OR: {
7791     auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
7792     if (!LHS)
7793       return None;
7794     auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
7795     if (!RHS)
7796       return None;
7798     if (LHS->isConstantZero())
7799       return RHS;
7800     if (RHS->isConstantZero())
7801       return LHS;
7802     return None;
7803   }
7804   case ISD::SHL: {
7805     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
7806     if (!ShiftOp)
7807       return None;
7809     uint64_t BitShift = ShiftOp->getZExtValue();
7810     if (BitShift % 8 != 0)
7811       return None;
7812     uint64_t ByteShift = BitShift / 8;
7814     return Index < ByteShift
7815                ? ByteProvider::getConstantZero()
7816                : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
7817                                        Depth + 1);
7818   }
7819 case ISD::ANY_EXTEND:
7820 case ISD::SIGN_EXTEND:
7821 case ISD::ZERO_EXTEND: {
7822 SDValue NarrowOp = Op->getOperand(0);
7823 unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
7824     if (NarrowBitWidth % 8 != 0)
7825       return None;
7826     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
7828 if (Index >= NarrowByteWidth)
7829 return Op.getOpcode() == ISD::ZERO_EXTEND
7830                ? Optional<ByteProvider>(ByteProvider::getConstantZero())
7831                : None;
7832     return calculateByteProvider(NarrowOp, Index, Depth + 1);
7833   }
7834   case ISD::BSWAP:
7835     return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
7836                                  Depth + 1);
7837   case ISD::LOAD: {
7838     auto L = cast<LoadSDNode>(Op.getNode());
7839     if (!L->isSimple() || L->isIndexed())
7840       return None;
7842 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
7843     if (NarrowBitWidth % 8 != 0)
7844       return None;
7845 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
7847 if (Index >= NarrowByteWidth)
7848 return L->getExtensionType() == ISD::ZEXTLOAD
7849                ? Optional<ByteProvider>(ByteProvider::getConstantZero())
7850                : None;
7851     return ByteProvider::getMemory(L, Index);
7852   }
7853   }
7855   return None;
7856 }
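// Illustrative walk-through (not part of the original file): for the i16
// expression
//   (or (zero_extend (load p)), (shl (zero_extend (load p+1)), 8))
// calculateByteProvider(Op, 0, 0) descends through the OR and zext to the
// first load (its byte 0), and calculateByteProvider(Op, 1, 0) through the
// SHL (ByteShift = 1) to the second load, so both bytes are memory
// providers at consecutive offsets, which is the precondition
// MatchLoadCombine needs to fold the tree into one wider load.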
7858 static unsigned littleEndianByteAt(unsigned BW, unsigned i) {
7859   return i;
7860 }
7862 static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
7863   return BW - i - 1;
7864 }
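// Quick check of the helpers above (illustrative): for BW = 4,
// littleEndianByteAt(4, 0) == 0 while bigEndianByteAt(4, 0) == 3, i.e. the
// least significant byte of an i32 sits at memory offset 0 on a
// little-endian target and at offset 3 on a big-endian one.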
7866 // Check if the bytes offsets we are looking at match with either big or
7867 // little endian value loaded. Return true for big endian, false for little
7868 // endian, and None if match failed.
7869 static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
7870 int64_t FirstOffset) {
7871   // The endianness can only be determined when there are at least 2 bytes.
7872   unsigned Width = ByteOffsets.size();
7873   if (Width < 2)
7874     return None;
7876 bool BigEndian = true, LittleEndian = true;
7877 for (unsigned i = 0; i < Width; i++) {
7878 int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
7879 LittleEndian &= CurrentByteOffset == littleEndianByteAt(Width, i);
7880 BigEndian &= CurrentByteOffset == bigEndianByteAt(Width, i);
7881     if (!BigEndian && !LittleEndian)
7882       return None;
7883   }
7885   assert((BigEndian != LittleEndian) && "It should be either big endian or"
7886                                         "little endian");
7887   return BigEndian;
7888 }
7890 static SDValue stripTruncAndExt(SDValue Value) {
7891   switch (Value.getOpcode()) {
7892   case ISD::TRUNCATE:
7893   case ISD::ZERO_EXTEND:
7894   case ISD::SIGN_EXTEND:
7895   case ISD::ANY_EXTEND:
7896     return stripTruncAndExt(Value.getOperand(0));
7897   }
7898   return Value;
7899 }
7901 /// Match a pattern where a wide type scalar value is stored by several narrow
7902 /// stores. Fold it into a single store or a BSWAP and a store if the target
7903 /// supports it.
7904 ///
7905 /// Assuming little endian target:
7906 ///  i8 *p = ...
7907 ///  i32 val = ...
7908 ///  p[0] = (val >> 0) & 0xFF;
7909 /// p[1] = (val >> 8) & 0xFF;
7910 /// p[2] = (val >> 16) & 0xFF;
7911 /// p[3] = (val >> 24) & 0xFF;
7912 /// =>
7913 ///  *((i32)p) = val;
7914 ///
7915 /// Assuming big endian target:
7916 ///  i8 *p = ...; i32 val = ...
7917 ///  p[0] = (val >> 24) & 0xFF;
7918 ///  p[1] = (val >> 16) & 0xFF;
7919 ///  p[2] = (val >> 8) & 0xFF;
7920 ///  p[3] = (val >> 0) & 0xFF;
7921 /// =>
7922 ///  *((i32)p) = BSWAP(val);
7923 SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
7924 // The matching looks for "store (trunc x)" patterns that appear early but are
7925 // likely to be replaced by truncating store nodes during combining.
7926 // TODO: If there is evidence that running this later would help, this
7927 // limitation could be removed. Legality checks may need to be added
7928 // for the created store and optional bswap/rotate.
7929   if (LegalOperations || OptLevel == CodeGenOpt::None)
7930     return SDValue();
7932 // We only handle merging simple stores of 1-4 bytes.
7933 // TODO: Allow unordered atomics when wider type is legal (see D66309)
7934 EVT MemVT = N->getMemoryVT();
7935 if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
7936       !N->isSimple() || N->isIndexed())
7937     return SDValue();
7939 // Collect all of the stores in the chain.
7940 SDValue Chain = N->getChain();
7941 SmallVector<StoreSDNode *, 8> Stores = {N};
7942 while (auto *Store = dyn_cast<StoreSDNode>(Chain)) {
7943 // All stores must be the same size to ensure that we are writing all of the
7944 // bytes in the wide value.
7945 // TODO: We could allow multiple sizes by tracking each stored byte.
7946     if (Store->getMemoryVT() != MemVT || !Store->isSimple() ||
7947         Store->isIndexed())
7948       return SDValue();
7949     Stores.push_back(Store);
7950     Chain = Store->getChain();
7951   }
7952   // There is no reason to continue if we do not have at least a pair of stores.
7953   if (Stores.size() < 2)
7954     return SDValue();
7956 // Handle simple types only.
7957 LLVMContext &Context = *DAG.getContext();
7958 unsigned NumStores = Stores.size();
7959 unsigned NarrowNumBits = N->getMemoryVT().getScalarSizeInBits();
7960 unsigned WideNumBits = NumStores * NarrowNumBits;
7961 EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
7962   if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
7963     return SDValue();
7965 // Check if all bytes of the source value that we are looking at are stored
7966 // to the same base address. Collect offsets from Base address into OffsetMap.
7967 SDValue SourceValue;
7968 SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX);
7969 int64_t FirstOffset = INT64_MAX;
7970 StoreSDNode *FirstStore = nullptr;
7971 Optional<BaseIndexOffset> Base;
7972 for (auto *Store : Stores) {
7973 // All the stores store different parts of the CombinedValue. A truncate is
7974 // required to get the partial value.
7975 SDValue Trunc = Store->getValue();
7976     if (Trunc.getOpcode() != ISD::TRUNCATE)
7977       return SDValue();
7978     // Other than the first/last part, a shift operation is required to get the
7979     // offset.
7980     int64_t Offset = 0;
7981     SDValue WideVal = Trunc.getOperand(0);
7982     if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) &&
7983         isa<ConstantSDNode>(WideVal.getOperand(1))) {
7984       // The shift amount must be a constant multiple of the narrow type.
7985       // It is translated to the offset address in the wide source value "y".
7986       //
7987       // x = srl y, ShiftAmtC
7988       // i8 z = trunc x
7989       // store z, ...
7990       uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1);
7991       if (ShiftAmtC % NarrowNumBits != 0)
7992         return SDValue();
7994       Offset = ShiftAmtC / NarrowNumBits;
7995       WideVal = WideVal.getOperand(0);
7996     }
7998     // Stores must share the same source value with different offsets.
7999     // Truncate and extends should be stripped to get the single source value.
8000     if (!SourceValue)
8001       SourceValue = WideVal;
8002     else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal))
8003       return SDValue();
8004 else if (SourceValue.getValueType() != WideVT) {
8005 if (WideVal.getValueType() == WideVT ||
8006 WideVal.getScalarValueSizeInBits() >
8007 SourceValue.getScalarValueSizeInBits())
8008 SourceValue = WideVal;
8009 // Give up if the source value type is smaller than the store size.
8010       if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits())
8011         return SDValue();
8012     }
8014     // Stores must share the same base address.
8015     BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
8016     int64_t ByteOffsetFromBase = 0;
8017     if (!Base)
8018       Base = Ptr;
8019     else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
8020       return SDValue();
8022     // Remember the first store.
8023     if (ByteOffsetFromBase < FirstOffset) {
8024       FirstStore = Store;
8025       FirstOffset = ByteOffsetFromBase;
8026     }
8028     // Map the offset in the store and the offset in the combined value, and
8029     // early return if it has been set before.
8030     if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
8031       return SDValue();
8032     OffsetMap[Offset] = ByteOffsetFromBase;
8033   }
8034 assert(FirstOffset != INT64_MAX && "First byte offset must be set");
8035 assert(FirstStore && "First store must be set");
8037 // Check that a store of the wide type is both allowed and fast on the target
8038   const DataLayout &Layout = DAG.getDataLayout();
8039   bool Fast = false;
8040   bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
8041                                         *FirstStore->getMemOperand(), &Fast);
8042   if (!Allowed || !Fast)
8043     return SDValue();
8045   // Check if the pieces of the value are going to the expected places in memory
8046   // to merge the stores.
8047   auto checkOffsets = [&](bool MatchLittleEndian) {
8048     if (MatchLittleEndian) {
8049       for (unsigned i = 0; i != NumStores; ++i)
8050         if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
8051           return false;
8052     } else { // MatchBigEndian by reversing loop counter.
8053       for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
8054         if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
8055           return false;
8056     }
8057     return true;
8058   };
8060 // Check if the offsets line up for the native data layout of this target.
8061 bool NeedBswap = false;
8062 bool NeedRotate = false;
8063 if (!checkOffsets(Layout.isLittleEndian())) {
8064 // Special-case: check if byte offsets line up for the opposite endian.
8065     if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
8066       NeedBswap = true;
8067     else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
8068       NeedRotate = true;
8069     else
8070       return SDValue();
8071   }
8073   SDLoc DL(N);
8074   if (WideVT != SourceValue.getValueType()) {
8075     assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
8076            "Unexpected store value to merge");
8077     SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
8078   }
8080   // Before legalize we can introduce illegal bswaps/rotates which will be later
8081   // converted to an explicit bswap sequence. This way we end up with a single
8082   // store and byte shuffling instead of several stores and byte shuffling.
8083   if (NeedBswap) {
8084     SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
8085   } else if (NeedRotate) {
8086     assert(WideNumBits % 2 == 0 && "Unexpected type for rotate");
8087     SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT);
8088     SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
8089   }
8091   SDValue NewStore =
8092       DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
8093                    FirstStore->getPointerInfo(), FirstStore->getAlign());
8095   // Rely on other DAG combine rules to remove the other individual stores.
8096   DAG.ReplaceAllUsesWith(N, NewStore.getNode());
8097   return NewStore;
8098 }
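// Worked example of the NeedRotate path above (a sketch, not from the
// original file): on a little-endian target, storing an i32 val as two i16
// halves in swapped order,
//   p16[0] = val >> 16;  p16[1] = val;
// matches the big-endian offset order with NarrowNumBits == 16, so rather
// than a bswap the combiner emits one i32 store of (rotr val, 16), which
// swaps the halfwords without reordering the bytes inside each half.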
8100 /// Match a pattern where a wide type scalar value is loaded by several narrow
8101 /// loads and combined by shifts and ors. Fold it into a single load or a load
8102 /// and a BSWAP if the targets supports it.
8104 /// Assuming little endian target:
8106 ///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
8107 /// =>
8108 ///  i32 val = *((i32)a)
8109 ///
8110 /// Assuming big endian target:
8111 ///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
8112 /// =>
8113 ///  i32 val = BSWAP(*((i32)a))
8115 /// TODO: This rule matches complex patterns with OR node roots and doesn't
8116 /// interact well with the worklist mechanism. When a part of the pattern is
8117 /// updated (e.g. one of the loads) its direct users are put into the worklist,
8118 /// but the root node of the pattern which triggers the load combine is not
8119 /// necessarily a direct user of the changed node. For example, once the address
8120 /// of t28 load is reassociated load combine won't be triggered:
8121 /// t25: i32 = add t4, Constant:i32<2>
8122 /// t26: i64 = sign_extend t25
8123 /// t27: i64 = add t2, t26
8124 /// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
8125 /// t29: i32 = zero_extend t28
8126 /// t32: i32 = shl t29, Constant:i8<8>
8127 /// t33: i32 = or t23, t32
8128 /// As a possible fix visitLoad can check if the load can be a part of a load
8129 /// combine pattern and add corresponding OR roots to the worklist.
8130 SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
8131 assert(N->getOpcode() == ISD::OR &&
8132 "Can only match load combining against OR nodes");
8134 // Handles simple types only
8135 EVT VT = N->getValueType(0);
8136   if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
8137     return SDValue();
8138 unsigned ByteWidth = VT.getSizeInBits() / 8;
8140 bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
8141 auto MemoryByteOffset = [&] (ByteProvider P) {
8142 assert(P.isMemory() && "Must be a memory byte provider");
8143 unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
8144 assert(LoadBitWidth % 8 == 0 &&
8145 "can only analyze providers for individual bytes not bit");
8146 unsigned LoadByteWidth = LoadBitWidth / 8;
8147 return IsBigEndianTarget
8148 ? bigEndianByteAt(LoadByteWidth, P.ByteOffset)
8149                : littleEndianByteAt(LoadByteWidth, P.ByteOffset);
8150   };
8152   Optional<BaseIndexOffset> Base;
8153   SDValue Chain;
8155 SmallPtrSet<LoadSDNode *, 8> Loads;
8156 Optional<ByteProvider> FirstByteProvider;
8157 int64_t FirstOffset = INT64_MAX;
8159 // Check if all the bytes of the OR we are looking at are loaded from the same
8160 // base address. Collect bytes offsets from Base address in ByteOffsets.
8161 SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
8162 unsigned ZeroExtendedBytes = 0;
8163 for (int i = ByteWidth - 1; i >= 0; --i) {
8164     auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
8165     if (!P)
8166       return SDValue();
8168     if (P->isConstantZero()) {
8169       // It's OK for the N most significant bytes to be 0, we can just
8170       // zero-extend the load.
8171       if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
8172         return SDValue();
8173       continue;
8174     }
8175     assert(P->isMemory() && "provenance should either be memory or zero");
8177     LoadSDNode *L = P->Load;
8178     assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
8179            !L->isIndexed() &&
8180            "Must be enforced by calculateByteProvider");
8181     assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
8183     // All loads must share the same chain
8184     SDValue LChain = L->getChain();
8185     if (!Chain)
8186       Chain = LChain;
8187     else if (Chain != LChain)
8188       return SDValue();
8190     // Loads must share the same base address
8191     BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
8192     int64_t ByteOffsetFromBase = 0;
8193     if (!Base)
8194       Base = Ptr;
8195     else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
8196       return SDValue();
8198     // Calculate the offset of the current byte from the base address
8199     ByteOffsetFromBase += MemoryByteOffset(*P);
8200     ByteOffsets[i] = ByteOffsetFromBase;
8202     // Remember the first byte load
8203     if (ByteOffsetFromBase < FirstOffset) {
8204       FirstByteProvider = P;
8205       FirstOffset = ByteOffsetFromBase;
8206     }
8208     Loads.insert(L);
8209   }
8210 assert(!Loads.empty() && "All the bytes of the value must be loaded from "
8211 "memory, so there must be at least one load which produces the value");
8212 assert(Base && "Base address of the accessed memory location must be set");
8213 assert(FirstOffset != INT64_MAX && "First byte offset must be set");
8215   bool NeedsZext = ZeroExtendedBytes > 0;
8217   EVT MemVT =
8218       EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);
8220   if (!MemVT.isSimple())
8221     return SDValue();
8223   // Before legalize we can introduce too wide illegal loads which will be later
8224   // split into legal sized loads. This enables us to combine i64 load by i8
8225   // patterns to a couple of i32 loads on 32 bit targets.
8226   if (LegalOperations &&
8227       !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
8228                             MemVT))
8229     return SDValue();
8231   // Check if the bytes of the OR we are looking at match with either big or
8232   // little endian value load
8233   Optional<bool> IsBigEndian = isBigEndian(
8234       makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
8235   if (!IsBigEndian)
8236     return SDValue();
8238 assert(FirstByteProvider && "must be set");
8240 // Ensure that the first byte is loaded from zero offset of the first load.
8241 // So the combined value can be loaded from the first load address.
8242   if (MemoryByteOffset(*FirstByteProvider) != 0)
8243     return SDValue();
8244   LoadSDNode *FirstLoad = FirstByteProvider->Load;
8246   // The node we are looking at matches with the pattern, check if we can
8247   // replace it with a single (possibly zero-extended) load and bswap + shift if
8248   // needed.
8250 // If the load needs byte swap check if the target supports it
8251 bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
8253 // Before legalize we can introduce illegal bswaps which will be later
8254 // converted to an explicit bswap sequence. This way we end up with a single
8255 // load and byte shuffling instead of several loads and byte shuffling.
8256 // We do not introduce illegal bswaps when zero-extending as this tends to
8257 // introduce too many arithmetic instructions.
8258 if (NeedsBswap && (LegalOperations || NeedsZext) &&
8259       !TLI.isOperationLegal(ISD::BSWAP, VT))
8260     return SDValue();
8262   // If we need to bswap and zero extend, we have to insert a shift. Check that
8263   // it is legal.
8264   if (NeedsBswap && NeedsZext && LegalOperations &&
8265       !TLI.isOperationLegal(ISD::SHL, VT))
8266     return SDValue();
8268   // Check that a load of the wide type is both allowed and fast on the target
8269   bool Fast = false;
8270   bool Allowed =
8271       TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
8272                              *FirstLoad->getMemOperand(), &Fast);
8273   if (!Allowed || !Fast)
8274     return SDValue();
8276   SDValue NewLoad =
8277 DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
8278 Chain, FirstLoad->getBasePtr(),
8279 FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
8281 // Transfer chain users from old loads to the new load.
8282 for (LoadSDNode *L : Loads)
8283 DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
8285   if (!NeedsBswap)
8286     return NewLoad;
8288   SDValue ShiftedLoad =
8289       NeedsZext
8290           ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
8291                         DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
8292                                                    SDLoc(N), LegalOperations))
8293           : NewLoad;
8294   return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
8295 }
8297 // If the target has andn, bsl, or a similar bit-select instruction,
8298 // we want to unfold masked merge, with canonical pattern of:
8299 //   |        A  |  |B|
8300 //   ((x ^ y) & m) ^ y
8301 //    |  D  |
8302 // Into:
8303 //   (x & m) | (y & ~m)
8304 // If y is a constant, m is not a 'not', and the 'andn' does not work with
8305 // immediates, we unfold into a different pattern:
8306 // ~(~x & m) & (m | y)
8307 // If x is a constant, m is a 'not', and the 'andn' does not work with
8308 // immediates, we unfold into a different pattern:
8309 // (x | ~m) & ~(~m & ~y)
8310 // NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
8311 // the very least that breaks andnpd / andnps patterns, and because those
8312 // patterns are simplified in IR and shouldn't be created in the DAG
8313 SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
8314 assert(N->getOpcode() == ISD::XOR);
8316 // Don't touch 'not' (i.e. where y = -1).
8317   if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
8318     return SDValue();
8320 EVT VT = N->getValueType(0);
8322 // There are 3 commutable operators in the pattern,
8323   // so we have to deal with 8 possible variants of the basic pattern.
8324   SDValue X, Y, M;
8325 auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
8326     if (And.getOpcode() != ISD::AND || !And.hasOneUse())
8327       return false;
8328     SDValue Xor = And.getOperand(XorIdx);
8329     if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
8330       return false;
8331     SDValue Xor0 = Xor.getOperand(0);
8332     SDValue Xor1 = Xor.getOperand(1);
8333     // Don't touch 'not' (i.e. where y = -1).
8334     if (isAllOnesOrAllOnesSplat(Xor1))
8335       return false;
8336     if (Other == Xor0)
8337       std::swap(Xor0, Xor1);
8338     if (Other != Xor1)
8339       return false;
8340     X = Xor0;
8341     Y = Xor1;
8342     M = And.getOperand(XorIdx ? 0 : 1);
8343     return true;
8344   };
8346 SDValue N0 = N->getOperand(0);
8347 SDValue N1 = N->getOperand(1);
8348 if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
8349       !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
8350     return SDValue();
8352 // Don't do anything if the mask is constant. This should not be reachable.
8353 // InstCombine should have already unfolded this pattern, and DAGCombiner
8354 // probably shouldn't produce it, too.
8355   if (isa<ConstantSDNode>(M.getNode()))
8356     return SDValue();
8358 // We can transform if the target has AndNot
8359   if (!TLI.hasAndNot(M))
8360     return SDValue();
8362   SDLoc DL(N);
8364 // If Y is a constant, check that 'andn' works with immediates. Unless M is
8365 // a bitwise not that would already allow ANDN to be used.
8366 if (!TLI.hasAndNot(Y) && !isBitwiseNot(M)) {
8367 assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
8368 // If not, we need to do a bit more work to make sure andn is still used.
8369 SDValue NotX = DAG.getNOT(DL, X, VT);
8370 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
8371 SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
8372 SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
8373     return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
8374   }
8376   // If X is a constant and M is a bitwise not, check that 'andn' works with
8377   // immediates.
8378 if (!TLI.hasAndNot(X) && isBitwiseNot(M)) {
8379 assert(TLI.hasAndNot(Y) && "Only mask is a variable? Unreachable.");
8380 // If not, we need to do a bit more work to make sure andn is still used.
8381 SDValue NotM = M.getOperand(0);
8382 SDValue LHS = DAG.getNode(ISD::OR, DL, VT, X, NotM);
8383 SDValue NotY = DAG.getNOT(DL, Y, VT);
8384 SDValue RHS = DAG.getNode(ISD::AND, DL, VT, NotM, NotY);
8385 SDValue NotRHS = DAG.getNOT(DL, RHS, VT);
8386 return DAG.getNode(ISD::AND, DL, VT, LHS, NotRHS);
8387 }
8389 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
8390 SDValue NotM = DAG.getNOT(DL, M, VT);
8391 SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
8393 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
8394 }
8396 SDValue DAGCombiner::visitXOR(SDNode *N) {
8397 SDValue N0 = N->getOperand(0);
8398 SDValue N1 = N->getOperand(1);
8399 EVT VT = N0.getValueType();
8400 SDLoc DL(N);
8402 // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
8403 if (N0.isUndef() && N1.isUndef())
8404 return DAG.getConstant(0, DL, VT);
8406 // fold (xor x, undef) -> undef
8407 if (N0.isUndef())
8408 return N0;
8409 if (N1.isUndef())
8410 return N1;
8412 // fold (xor c1, c2) -> c1^c2
8413 if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
8414 return C;
8416 // canonicalize constant to RHS
8417 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
8418 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
8419 return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
8421 // fold vector ops
8422 if (VT.isVector()) {
8423 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
8424 return FoldedVOp;
8426 // fold (xor x, 0) -> x, vector edition
8427 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
8428 return N0;
8429 }
8431 // fold (xor x, 0) -> x
8432 if (isNullConstant(N1))
8433 return N0;
8435 if (SDValue NewSel = foldBinOpIntoSelect(N))
8436 return NewSel;
8438 // reassociate xor
8439 if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
8440 return RXOR;
8442 // look for 'add-like' folds:
8443 // XOR(N0,MIN_SIGNED_VALUE) == ADD(N0,MIN_SIGNED_VALUE)
8444 if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
8445 isMinSignedConstant(N1))
8446 if (SDValue Combined = visitADDLike(N))
8447 return Combined;
8449 // fold !(x cc y) -> (x !cc y)
8450 unsigned N0Opcode = N0.getOpcode();
8451 SDValue LHS, RHS, CC;
8452 if (TLI.isConstTrueVal(N1) &&
8453 isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/ true)) {
8454 ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
8455 LHS.getValueType());
8456 if (!LegalOperations ||
8457 TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
8458 switch (N0Opcode) {
8459 default:
8460 llvm_unreachable("Unhandled SetCC Equivalent!");
8461 case ISD::SETCC:
8462 return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
8463 case ISD::SELECT_CC:
8464 return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
8465 N0.getOperand(3), NotCC);
8466 case ISD::STRICT_FSETCC:
8467 case ISD::STRICT_FSETCCS: {
8468 if (N0.hasOneUse()) {
8469 // FIXME Can we handle multiple uses? Could we token factor the chain
8470 // results from the new/old setcc?
8471 SDValue SetCC =
8472 DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
8473 N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
8474 CombineTo(N, SetCC);
8475 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
8476 recursivelyDeleteUnusedNodes(N0.getNode());
8477 return SDValue(N, 0); // Return N so it doesn't get rechecked!
8478 }
8479 break;
8480 }
8481 }
8482 }
8483 }
8485 // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
8486 if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
8487 isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)) {
8488 SDValue V = N0.getOperand(0);
8489 SDLoc DL0(V);
8490 V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
8491 DAG.getConstant(1, DL0, V.getValueType()));
8492 AddToWorklist(V.getNode());
8493 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
8494 }
8496 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
8497 if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
8498 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
8499 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
8500 if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
8501 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
8502 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
8503 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
8504 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
8505 return DAG.getNode(NewOpcode, DL, VT, N00, N01);
8506 }
8507 }
8508 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
8509 if (isAllOnesConstant(N1) && N0.hasOneUse() &&
8510 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
8511 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
8512 if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
8513 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
8514 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
8515 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
8516 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
8517 return DAG.getNode(NewOpcode, DL, VT, N00, N01);
8518 }
8519 }
8521 // fold (not (neg x)) -> (add X, -1)
8522 // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
8523 // Y is a constant or the subtract has a single use.
8524 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
8525 isNullConstant(N0.getOperand(0))) {
8526 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
8527 DAG.getAllOnesConstant(DL, VT));
8528 }
8530 // fold (not (add X, -1)) -> (neg X)
8531 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
8532 isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
8533 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
8534 N0.getOperand(0));
8535 }
8537 // fold (xor (and x, y), y) -> (and (not x), y)
8538 if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
8539 SDValue X = N0.getOperand(0);
8540 SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
8541 AddToWorklist(NotX.getNode());
8542 return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
8543 }
8545 // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
8546 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
8547 SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
8548 SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
8549 if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
8550 SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
8551 SDValue S0 = S.getOperand(0);
8552 if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0))
8553 if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
8554 if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
8555 return DAG.getNode(ISD::ABS, DL, VT, S0);
8556 }
8557 }
8559 // fold (xor x, x) -> 0
8560 if (N0 == N1)
8561 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
8563 // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
8564 // Here is a concrete example of this equivalence:
8565 // i16 x == 14
8566 // i16 shl == 1 << 14 == 16384 == 0b0100000000000000
8567 // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
8568 //
8569 // =>
8570 //
8571 // i16 ~1 == 0b1111111111111110
8572 // i16 rol(~1, 14) == 0b1011111111111111
8574 // Some additional tips to help conceptualize this transform:
8575 // - Try to see the operation as placing a single zero in a value of all ones.
8576 // - There exists no value for x which would allow the result to contain zero.
8577 // - Values of x larger than the bitwidth are undefined and do not require a
8578 // consistent result.
8579 // - Pushing the zero left requires shifting one-bits in from the right.
8580 // A rotate left of ~1 is a nice way of achieving the desired result.
8581 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
8582 isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
8583 return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
8584 N0.getOperand(1));
8585 }
8587 // Simplify: xor (op x...), (op y...) -> (op (xor x, y))
8588 if (N0Opcode == N1.getOpcode())
8589 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
8590 return V;
8592 if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
8593 return R;
8594 if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
8595 return R;
8597 // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
8598 if (SDValue MM = unfoldMaskedMerge(N))
8599 return MM;
8601 // Simplify the expression using non-local knowledge.
8602 if (SimplifyDemandedBits(SDValue(N, 0)))
8603 return SDValue(N, 0);
8605 if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
8606 return Combined;
8608 return SDValue();
8609 }
8611 /// If we have a shift-by-constant of a bitwise logic op that itself has a
8612 /// shift-by-constant operand with identical opcode, we may be able to convert
8613 /// that into 2 independent shifts followed by the logic op. This is a
8614 /// throughput improvement.
8615 static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
8616 // Match a one-use bitwise logic op.
8617 SDValue LogicOp = Shift->getOperand(0);
8618 if (!LogicOp.hasOneUse())
8619 return SDValue();
8621 unsigned LogicOpcode = LogicOp.getOpcode();
8622 if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
8623 LogicOpcode != ISD::XOR)
8624 return SDValue();
8626 // Find a matching one-use shift by constant.
8627 unsigned ShiftOpcode = Shift->getOpcode();
8628 SDValue C1 = Shift->getOperand(1);
8629 ConstantSDNode *C1Node = isConstOrConstSplat(C1);
8630 assert(C1Node && "Expected a shift with constant operand");
8631 const APInt &C1Val = C1Node->getAPIntValue();
8632 auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
8633 const APInt *&ShiftAmtVal) {
8634 if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
8635 return false;
8637 ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
8638 if (!ShiftCNode)
8639 return false;
8641 // Capture the shifted operand and shift amount value.
8642 ShiftOp = V.getOperand(0);
8643 ShiftAmtVal = &ShiftCNode->getAPIntValue();
8645 // Shift amount types do not have to match their operand type, so check that
8646 // the constants are the same width.
8647 if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
8648 return false;
8650 // The fold is not valid if the sum of the shift values exceeds bitwidth.
8651 if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
8652 return false;
8654 return true;
8655 };
8657 // Logic ops are commutative, so check each operand for a match.
8658 SDValue X, Y;
8659 const APInt *C0Val;
8660 if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
8661 Y = LogicOp.getOperand(1);
8662 else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
8663 Y = LogicOp.getOperand(0);
8664 else
8665 return SDValue();
8667 // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
8668 SDLoc DL(Shift);
8669 EVT VT = Shift->getValueType(0);
8670 EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
8671 SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
8672 SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
8673 SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
8674 return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
8675 }
8677 /// Handle transforms common to the three shifts, when the shift amount is a
8678 /// constant.
8679 /// We are looking for: (shift being one of shl/sra/srl)
8680 /// shift (binop X, C0), C1
8681 /// And want to transform into:
8682 /// binop (shift X, C1), (shift C0, C1)
8683 SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
8684 assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
8686 // Do not turn a 'not' into a regular xor.
8687 if (isBitwiseNot(N->getOperand(0)))
8688 return SDValue();
8690 // The inner binop must be one-use, since we want to replace it.
8691 SDValue LHS = N->getOperand(0);
8692 if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
8693 return SDValue();
8695 // TODO: This is limited to early combining because it may reveal regressions
8696 // otherwise. But since we just checked a target hook to see if this is
8697 // desirable, that should have filtered out cases where this interferes
8698 // with some other pattern matching.
8699 if (!LegalTypes)
8700 if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
8701 return R;
8703 // We want to pull some binops through shifts, so that we have (and (shift))
8704 // instead of (shift (and)), likewise for add, or, xor, etc. This sort of
8705 // thing happens with address calculations, so it's important to canonicalize
8706 // it.
8707 switch (LHS.getOpcode()) {
8708 default:
8709 return SDValue();
8710 case ISD::OR:
8711 case ISD::XOR:
8712 case ISD::AND:
8713 break;
8714 case ISD::ADD:
8715 if (N->getOpcode() != ISD::SHL)
8716 return SDValue(); // only shl(add) not sr[al](add).
8717 break;
8718 }
8720 // We require the RHS of the binop to be a constant and not opaque as well.
8721 ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
8722 if (!BinOpCst)
8723 return SDValue();
8725 // FIXME: disable this unless the input to the binop is a shift by a constant
8726 // or is copy/select. Enable this in other cases when we figure out it's
8727 // exactly profitable.
8728 SDValue BinOpLHSVal = LHS.getOperand(0);
8729 bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
8730 BinOpLHSVal.getOpcode() == ISD::SRA ||
8731 BinOpLHSVal.getOpcode() == ISD::SRL) &&
8732 isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
8733 bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
8734 BinOpLHSVal.getOpcode() == ISD::SELECT;
8736 if (!IsShiftByConstant && !IsCopyOrSelect)
8737 return SDValue();
8739 if (IsCopyOrSelect && N->hasOneUse())
8740 return SDValue();
8742 // Fold the constants, shifting the binop RHS by the shift amount.
8743 SDLoc DL(N);
8744 EVT VT = N->getValueType(0);
8745 SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
8746 N->getOperand(1));
8747 assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
8749 SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
8750 N->getOperand(1));
8751 return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
8752 }
8754 SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
8755 assert(N->getOpcode() == ISD::TRUNCATE);
8756 assert(N->getOperand(0).getOpcode() == ISD::AND);
8758 // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
8759 EVT TruncVT = N->getValueType(0);
8760 if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
8761 TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
8762 SDValue N01 = N->getOperand(0).getOperand(1);
8763 if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
8764 SDLoc DL(N);
8765 SDValue N00 = N->getOperand(0).getOperand(0);
8766 SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
8767 SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
8768 AddToWorklist(Trunc00.getNode());
8769 AddToWorklist(Trunc01.getNode());
8770 return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
8771 }
8772 }
8774 return SDValue();
8775 }
8777 SDValue DAGCombiner::visitRotate(SDNode *N) {
8778 SDLoc dl(N);
8779 SDValue N0 = N->getOperand(0);
8780 SDValue N1 = N->getOperand(1);
8781 EVT VT = N->getValueType(0);
8782 unsigned Bitsize = VT.getScalarSizeInBits();
8784 // fold (rot x, 0) -> x
8785 if (isNullOrNullSplat(N1))
8786 return N0;
8788 // fold (rot x, c) -> x iff (c % BitSize) == 0
8789 if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
8790 APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
8791 if (DAG.MaskedValueIsZero(N1, ModuloMask))
8792 return N0;
8793 }
8795 // fold (rot x, c) -> (rot x, c % BitSize)
8796 bool OutOfRange = false;
8797 auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
8798 OutOfRange |= C->getAPIntValue().uge(Bitsize);
8799 return true;
8800 };
8801 if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
8802 EVT AmtVT = N1.getValueType();
8803 SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
8804 if (SDValue Amt =
8805 DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
8806 return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
8807 }
8809 // rot i16 X, 8 --> bswap X
8810 auto *RotAmtC = isConstOrConstSplat(N1);
8811 if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
8812 VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
8813 return DAG.getNode(ISD::BSWAP, dl, VT, N0);
8815 // Simplify the operands using demanded-bits information.
8816 if (SimplifyDemandedBits(SDValue(N, 0)))
8817 return SDValue(N, 0);
8819 // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
8820 if (N1.getOpcode() == ISD::TRUNCATE &&
8821 N1.getOperand(0).getOpcode() == ISD::AND) {
8822 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
8823 return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
8824 }
8826 unsigned NextOp = N0.getOpcode();
8828 // fold (rot* (rot* x, c2), c1)
8829 // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize)) % bitsize)
8830 if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
8831 SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
8832 SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
8833 if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
8834 EVT ShiftVT = C1->getValueType(0);
8835 bool SameSide = (N->getOpcode() == NextOp);
8836 unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
8837 SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
8838 SDValue Norm1 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
8839 {N1, BitsizeC});
8840 SDValue Norm2 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
8841 {N0.getOperand(1), BitsizeC});
8843 if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
8844 CombineOp, dl, ShiftVT, {Norm1, Norm2})) {
8845 SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
8846 ISD::UREM, dl, ShiftVT, {CombinedShift, BitsizeC});
8847 return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
8848 CombinedShiftNorm);
8849 }
8850 }
8851 }
8852 return SDValue();
8853 }
8855 SDValue DAGCombiner::visitSHL(SDNode *N) {
8856 SDValue N0 = N->getOperand(0);
8857 SDValue N1 = N->getOperand(1);
8858 if (SDValue V = DAG.simplifyShift(N0, N1))
8859 return V;
8861 EVT VT = N0.getValueType();
8862 EVT ShiftVT = N1.getValueType();
8863 unsigned OpSizeInBits = VT.getScalarSizeInBits();
8865 // fold (shl c1, c2) -> c1<<c2
8866 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
8867 return C;
8869 // fold vector ops
8870 if (VT.isVector()) {
8871 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
8872 return FoldedVOp;
8874 BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
8875 // If setcc produces all-one true value then:
8876 // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
8877 if (N1CV && N1CV->isConstant()) {
8878 if (N0.getOpcode() == ISD::AND) {
8879 SDValue N00 = N0->getOperand(0);
8880 SDValue N01 = N0->getOperand(1);
8881 BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
8883 if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
8884 TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
8885 TargetLowering::ZeroOrNegativeOneBooleanContent) {
8886 if (SDValue C =
8887 DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
8888 return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
8889 }
8890 }
8891 }
8892 }
8894 if (SDValue NewSel = foldBinOpIntoSelect(N))
8895 return NewSel;
8897 // if (shl x, c) is known to be zero, return 0
8898 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
8899 return DAG.getConstant(0, SDLoc(N), VT);
8901 // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
8902 if (N1.getOpcode() == ISD::TRUNCATE &&
8903 N1.getOperand(0).getOpcode() == ISD::AND) {
8904 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
8905 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
8906 }
8908 if (SimplifyDemandedBits(SDValue(N, 0)))
8909 return SDValue(N, 0);
8911 // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
8912 if (N0.getOpcode() == ISD::SHL) {
8913 auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
8914 ConstantSDNode *RHS) {
8915 APInt c1 = LHS->getAPIntValue();
8916 APInt c2 = RHS->getAPIntValue();
8917 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8918 return (c1 + c2).uge(OpSizeInBits);
8919 };
8920 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
8921 return DAG.getConstant(0, SDLoc(N), VT);
8923 auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
8924 ConstantSDNode *RHS) {
8925 APInt c1 = LHS->getAPIntValue();
8926 APInt c2 = RHS->getAPIntValue();
8927 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8928 return (c1 + c2).ult(OpSizeInBits);
8929 };
8930 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
8931 SDLoc DL(N);
8932 SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
8933 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
8934 }
8935 }
8937 // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
8938 // For this to be valid, the second form must not preserve any of the bits
8939 // that are shifted out by the inner shift in the first form. This means
8940 // the outer shift size must be >= the number of bits added by the ext.
8941 // As a corollary, we don't care what kind of ext it is.
8942 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
8943 N0.getOpcode() == ISD::ANY_EXTEND ||
8944 N0.getOpcode() == ISD::SIGN_EXTEND) &&
8945 N0.getOperand(0).getOpcode() == ISD::SHL) {
8946 SDValue N0Op0 = N0.getOperand(0);
8947 SDValue InnerShiftAmt = N0Op0.getOperand(1);
8948 EVT InnerVT = N0Op0.getValueType();
8949 uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();
8951 auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
8952 ConstantSDNode *RHS) {
8953 APInt c1 = LHS->getAPIntValue();
8954 APInt c2 = RHS->getAPIntValue();
8955 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8956 return c2.uge(OpSizeInBits - InnerBitwidth) &&
8957 (c1 + c2).uge(OpSizeInBits);
8958 };
8959 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
8960 /*AllowUndefs*/ false,
8961 /*AllowTypeMismatch*/ true))
8962 return DAG.getConstant(0, SDLoc(N), VT);
8964 auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
8965 ConstantSDNode *RHS) {
8966 APInt c1 = LHS->getAPIntValue();
8967 APInt c2 = RHS->getAPIntValue();
8968 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8969 return c2.uge(OpSizeInBits - InnerBitwidth) &&
8970 (c1 + c2).ult(OpSizeInBits);
8971 };
8972 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
8973 /*AllowUndefs*/ false,
8974 /*AllowTypeMismatch*/ true)) {
8975 SDLoc DL(N);
8976 SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
8977 SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
8978 Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
8979 return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
8980 }
8981 }
8983 // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
8984 // Only fold this if the inner zext has no other uses to avoid increasing
8985 // the total number of instructions.
8986 if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
8987 N0.getOperand(0).getOpcode() == ISD::SRL) {
8988 SDValue N0Op0 = N0.getOperand(0);
8989 SDValue InnerShiftAmt = N0Op0.getOperand(1);
8991 auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
8992 APInt c1 = LHS->getAPIntValue();
8993 APInt c2 = RHS->getAPIntValue();
8994 zeroExtendToMatch(c1, c2);
8995 return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
8996 };
8997 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
8998 /*AllowUndefs*/ false,
8999 /*AllowTypeMismatch*/ true)) {
9000 SDLoc DL(N);
9001 EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
9002 SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
9003 NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
9004 AddToWorklist(NewSHL.getNode());
9005 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
9006 }
9007 }
9009 if (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) {
9010 auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
9011 ConstantSDNode *RHS) {
9012 const APInt &LHSC = LHS->getAPIntValue();
9013 const APInt &RHSC = RHS->getAPIntValue();
9014 return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
9015 LHSC.getZExtValue() <= RHSC.getZExtValue();
9016 };
9018 SDLoc DL(N);
9020 // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
9021 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 >= C2
9022 if (N0->getFlags().hasExact()) {
9023 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
9024 /*AllowUndefs*/ false,
9025 /*AllowTypeMismatch*/ true)) {
9026 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9027 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
9028 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
9029 }
9030 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
9031 /*AllowUndefs*/ false,
9032 /*AllowTypeMismatch*/ true)) {
9033 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9034 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
9035 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff);
9036 }
9037 }
9039 // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1), MASK) or
9040 // (and (srl x, (sub c1, c2), MASK)
9041 // Only fold this if the inner shift has no other uses -- if it does,
9042 // folding this will increase the total number of instructions.
9043 if (N0.getOpcode() == ISD::SRL &&
9044 (N0.getOperand(1) == N1 || N0.hasOneUse()) &&
9045 TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
9046 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
9047 /*AllowUndefs*/ false,
9048 /*AllowTypeMismatch*/ true)) {
9049 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9050 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
9051 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9052 Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N01);
9053 Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, Diff);
9054 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
9055 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9056 }
9057 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
9058 /*AllowUndefs*/ false,
9059 /*AllowTypeMismatch*/ true)) {
9060 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9061 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
9062 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9063 Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N1);
9064 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
9065 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9066 }
9067 }
9068 }
9070 // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
9071 if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
9072 isConstantOrConstantVector(N1, /* No Opaques */ true)) {
9073 SDLoc DL(N);
9074 SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
9075 SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
9076 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
9077 }
9079 // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
9080 // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
9081 // Variant of version done on multiply, except mul by a power of 2 is turned
9082 // into a shift.
9083 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
9084 N0->hasOneUse() &&
9085 isConstantOrConstantVector(N1, /* No Opaques */ true) &&
9086 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
9087 TLI.isDesirableToCommuteWithShift(N, Level)) {
9088 SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
9089 SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
9090 AddToWorklist(Shl0.getNode());
9091 AddToWorklist(Shl1.getNode());
9092 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
9093 }
9095 // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
9096 if (N0.getOpcode() == ISD::MUL && N0->hasOneUse()) {
9097 SDValue N01 = N0.getOperand(1);
9098 if (SDValue Shl =
9099 DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1}))
9100 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
9101 }
9103 ConstantSDNode *N1C = isConstOrConstSplat(N1);
9104 if (N1C && !N1C->isOpaque())
9105 if (SDValue NewSHL = visitShiftByConstant(N))
9106 return NewSHL;
9108 // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
9109 if (N0.getOpcode() == ISD::VSCALE)
9110 if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
9111 const APInt &C0 = N0.getConstantOperandAPInt(0);
9112 const APInt &C1 = NC1->getAPIntValue();
9113 return DAG.getVScale(SDLoc(N), VT, C0 << C1);
9114 }
9116 // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)).
9117 APInt ShlVal;
9118 if (N0.getOpcode() == ISD::STEP_VECTOR)
9119 if (ISD::isConstantSplatVector(N1.getNode(), ShlVal)) {
9120 const APInt &C0 = N0.getConstantOperandAPInt(0);
9121 if (ShlVal.ult(C0.getBitWidth())) {
9122 APInt NewStep = C0 << ShlVal;
9123 return DAG.getStepVector(SDLoc(N), VT, NewStep);
9124 }
9125 }
9127 return SDValue();
9128 }
9130 // Transform a right shift of a multiply into a multiply-high.
9131 // Examples:
9132 // (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
9133 // (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
9134 static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
9135 const TargetLowering &TLI) {
9136 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
9137 "SRL or SRA node is required here!");
9139 // Check the shift amount. Proceed with the transformation if the shift
9140 // amount is constant.
9141 ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
9142 if (!ShiftAmtSrc)
9143 return SDValue();
9145 SDLoc DL(N);
9147 // The operation feeding into the shift must be a multiply.
9148 SDValue ShiftOperand = N->getOperand(0);
9149 if (ShiftOperand.getOpcode() != ISD::MUL)
9150 return SDValue();
9152 // Both operands must be equivalent extend nodes.
9153 SDValue LeftOp = ShiftOperand.getOperand(0);
9154 SDValue RightOp = ShiftOperand.getOperand(1);
9156 bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
9157 bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
9159 if (!IsSignExt && !IsZeroExt)
9160 return SDValue();
9162 EVT NarrowVT = LeftOp.getOperand(0).getValueType();
9163 unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
9165 SDValue MulhRightOp;
9166 if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
9167 unsigned ActiveBits = IsSignExt
9168 ? Constant->getAPIntValue().getMinSignedBits()
9169 : Constant->getAPIntValue().getActiveBits();
9170 if (ActiveBits > NarrowVTSize)
9171 return SDValue();
9172 MulhRightOp = DAG.getConstant(
9173 Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
9174 NarrowVT);
9175 } else {
9176 if (LeftOp.getOpcode() != RightOp.getOpcode())
9177 return SDValue();
9178 // Check that the two extend nodes are the same type.
9179 if (NarrowVT != RightOp.getOperand(0).getValueType())
9180 return SDValue();
9181 MulhRightOp = RightOp.getOperand(0);
9182 }
9184 EVT WideVT = LeftOp.getValueType();
9185 // Proceed with the transformation if the wide types match.
9186 assert((WideVT == RightOp.getValueType()) &&
9187 "Cannot have a multiply node with two different operand types.");
9189 // Proceed with the transformation if the wide type is twice as large
9190 // as the narrow type.
9191 if (WideVT.getScalarSizeInBits() != 2 * NarrowVTSize)
9192 return SDValue();
9194 // Check the shift amount with the narrow type size.
9195 // Proceed with the transformation if the shift amount is the width
9196 // of the narrow type.
9197 unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
9198 if (ShiftAmt != NarrowVTSize)
9199 return SDValue();
9201 // If the operation feeding into the MUL is a sign extend (sext),
9202 // we use mulhs. Otherwise, zero extends (zext) use mulhu.
9203 unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
9205 // Combine to mulh if mulh is legal/custom for the narrow type on the target.
9206 if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))
9207 return SDValue();
9209 SDValue Result =
9210 DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);
9211 return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT)
9212 : DAG.getZExtOrTrunc(Result, DL, WideVT));
9213 }
9215 SDValue DAGCombiner::visitSRA(SDNode *N) {
9216 SDValue N0 = N->getOperand(0);
9217 SDValue N1 = N->getOperand(1);
9218 if (SDValue V = DAG.simplifyShift(N0, N1))
9219 return V;
9221 EVT VT = N0.getValueType();
9222 unsigned OpSizeInBits = VT.getScalarSizeInBits();
9224 // fold (sra c1, c2) -> c1>>c2
9225 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
9226 return C;
9228 // Arithmetic shifting an all-sign-bit value is a no-op.
9229 // fold (sra 0, x) -> 0
9230 // fold (sra -1, x) -> -1
9231 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
9232 return N0;
9234 // fold vector ops
9235 if (VT.isVector())
9236 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
9237 return FoldedVOp;
9239 if (SDValue NewSel = foldBinOpIntoSelect(N))
9240 return NewSel;
9242 // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
9243 // sext_inreg.
9244 ConstantSDNode *N1C = isConstOrConstSplat(N1);
9245 if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
9246 unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
9247 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
9248 if (VT.isVector())
9249 ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
9250 VT.getVectorElementCount());
9251 if (!LegalOperations ||
9252 TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
9253 TargetLowering::Legal)
9254 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
9255 N0.getOperand(0), DAG.getValueType(ExtVT));
9256 // Even if we can't convert to sext_inreg, we might be able to remove
9257 // this shift pair if the input is already sign extended.
9258 if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
9259 return N0.getOperand(0);
9260 }
9262 // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
9263 // clamp (add c1, c2) to max shift.
9264 if (N0.getOpcode() == ISD::SRA) {
9265 SDLoc DL(N);
9266 EVT ShiftVT = N1.getValueType();
9267 EVT ShiftSVT = ShiftVT.getScalarType();
9268 SmallVector<SDValue, 16> ShiftValues;
9270 auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
9271 APInt c1 = LHS->getAPIntValue();
9272 APInt c2 = RHS->getAPIntValue();
9273 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
9274 APInt Sum = c1 + c2;
9275 unsigned ShiftSum =
9276 Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
9277 ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
9278 return true;
9279 };
9280 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
9281 SDValue ShiftValue;
9282 if (N1.getOpcode() == ISD::BUILD_VECTOR)
9283 ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
9284 else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
9285 assert(ShiftValues.size() == 1 &&
9286 "Expected matchBinaryPredicate to return one element for "
9287 "SPLAT_VECTORs");
9288 ShiftValue = DAG.getSplatVector(ShiftVT, DL, ShiftValues[0]);
9289 } else
9290 ShiftValue = ShiftValues[0];
9291 return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
9292 }
9293 }
9295 // fold (sra (shl X, m), (sub result_size, n))
9296 // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
9297 // result_size - n != m.
9298 // If truncate is free for the target, sext(shl) is likely to result in better
9299 // code.
9300 if (N0.getOpcode() == ISD::SHL && N1C) {
9301 // Get the two constants of the shifts, CN0 = m, CN = n.
9302 const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
9303 if (N01C) {
9304 LLVMContext &Ctx = *DAG.getContext();
9305 // Determine what the truncate's result bitsize and type would be.
9306 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
9308 if (VT.isVector())
9309 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
9311 // Determine the residual right-shift amount.
9312 int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
9314 // If the shift is not a no-op (in which case this should be just a sign
9315 // extend already), the truncated to type is legal, sign_extend is legal
9316 // on that type, and the truncate to that type is both legal and free,
9317 // perform the transform.
9318 if ((ShiftAmt > 0) &&
9319 TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
9320 TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
9321 TLI.isTruncateFree(VT, TruncVT)) {
9322 SDLoc DL(N);
9323 SDValue Amt = DAG.getConstant(ShiftAmt, DL,
9324 getShiftAmountTy(N0.getOperand(0).getValueType()));
9325 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
9326 N0.getOperand(0), Amt);
9327 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
9328 Shift);
9329 return DAG.getNode(ISD::SIGN_EXTEND, DL,
9330 N->getValueType(0), Trunc);
9331 }
9332 }
9333 }
9335 // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
9336 // sra (add (shl X, N1C), AddC), N1C -->
9337 // sext (add (trunc X to (width - N1C)), AddC')
9338 // sra (sub AddC, (shl X, N1C)), N1C -->
9339 // sext (sub AddC1',(trunc X to (width - N1C)))
9340 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB) && N1C &&
9341 N0.hasOneUse()) {
9342 bool IsAdd = N0.getOpcode() == ISD::ADD;
9343 SDValue Shl = N0.getOperand(IsAdd ? 0 : 1);
9344 if (Shl.getOpcode() == ISD::SHL && Shl.getOperand(1) == N1 &&
9345 Shl.hasOneUse()) {
9346 // TODO: AddC does not need to be a splat.
9347 if (ConstantSDNode *AddC =
9348 isConstOrConstSplat(N0.getOperand(IsAdd ? 1 : 0))) {
9349 // Determine what the truncate's type would be and ask the target if
9350 // that is a free operation.
9351 LLVMContext &Ctx = *DAG.getContext();
9352 unsigned ShiftAmt = N1C->getZExtValue();
9353 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
9354 if (VT.isVector())
9355 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
9357 // TODO: The simple type check probably belongs in the default hook
9358 // implementation and/or target-specific overrides (because
9359 // non-simple types likely require masking when legalized), but
9360 // that restriction may conflict with other transforms.
9361 if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
9362 TLI.isTruncateFree(VT, TruncVT)) {
9363 SDLoc DL(N);
9364 SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
9365 SDValue ShiftC =
9366 DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).trunc(
9367 TruncVT.getScalarSizeInBits()),
9368 DL, TruncVT);
9369 SDValue Add;
9370 if (IsAdd)
9371 Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
9372 else
9373 Add = DAG.getNode(ISD::SUB, DL, TruncVT, ShiftC, Trunc);
9374 return DAG.getSExtOrTrunc(Add, DL, VT);
9375 }
9376 }
9377 }
9378 }
9380 // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
9381 if (N1.getOpcode() == ISD::TRUNCATE &&
9382 N1.getOperand(0).getOpcode() == ISD::AND) {
9383 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
9384 return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
9385 }
9387 // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
9388 // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
9389 // if c1 is equal to the number of bits the trunc removes
9390 // TODO - support non-uniform vector shift amounts.
9391 if (N0.getOpcode() == ISD::TRUNCATE &&
9392 (N0.getOperand(0).getOpcode() == ISD::SRL ||
9393 N0.getOperand(0).getOpcode() == ISD::SRA) &&
9394 N0.getOperand(0).hasOneUse() &&
9395 N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
9396 SDValue N0Op0 = N0.getOperand(0);
9397 if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
9398 EVT LargeVT = N0Op0.getValueType();
9399 unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
9400 if (LargeShift->getAPIntValue() == TruncBits) {
9401 SDLoc DL(N);
9402 EVT LargeShiftVT = getShiftAmountTy(LargeVT);
9403 SDValue Amt = DAG.getZExtOrTrunc(N1, DL, LargeShiftVT);
9404 Amt = DAG.getNode(ISD::ADD, DL, LargeShiftVT, Amt,
9405 DAG.getConstant(TruncBits, DL, LargeShiftVT));
9406 SDValue SRA =
9407 DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
9408 return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
9409 }
9410 }
9411 }
9413 // Simplify, based on bits shifted out of the LHS.
9414 if (SimplifyDemandedBits(SDValue(N, 0)))
9415 return SDValue(N, 0);
9417 // If the sign bit is known to be zero, switch this to a SRL.
9418 if (DAG.SignBitIsZero(N0))
9419 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
9421 if (N1C && !N1C->isOpaque())
9422 if (SDValue NewSRA = visitShiftByConstant(N))
9423 return NewSRA;
9425 // Try to transform this shift into a multiply-high if
9426 // it matches the appropriate pattern detected in combineShiftToMULH.
9427 if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
9428 return MULH;
9430 // Attempt to convert a sra of a load into a narrower sign-extending load.
9431 if (SDValue NarrowLoad = reduceLoadWidth(N))
9432 return NarrowLoad;
9434 return SDValue();
9435 }
9437 SDValue DAGCombiner::visitSRL(SDNode *N) {
9438 SDValue N0 = N->getOperand(0);
9439 SDValue N1 = N->getOperand(1);
9440 if (SDValue V = DAG.simplifyShift(N0, N1))
9441 return V;
9443 EVT VT = N0.getValueType();
9444 EVT ShiftVT = N1.getValueType();
9445 unsigned OpSizeInBits = VT.getScalarSizeInBits();
9447 // fold (srl c1, c2) -> c1 >>u c2
9448 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
9449 return C;
9451 // fold vector ops
9452 if (VT.isVector())
9453 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
9454 return FoldedVOp;
9456 if (SDValue NewSel = foldBinOpIntoSelect(N))
9457 return NewSel;
9459 // if (srl x, c) is known to be zero, return 0
9460 ConstantSDNode *N1C = isConstOrConstSplat(N1);
9461 if (N1C &&
9462 DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
9463 return DAG.getConstant(0, SDLoc(N), VT);
9465 // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
9466 if (N0.getOpcode() == ISD::SRL) {
9467 auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
9468 ConstantSDNode *RHS) {
9469 APInt c1 = LHS->getAPIntValue();
9470 APInt c2 = RHS->getAPIntValue();
9471 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
9472 return (c1 + c2).uge(OpSizeInBits);
9473 };
9474 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
9475 return DAG.getConstant(0, SDLoc(N), VT);
9477 auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
9478 ConstantSDNode *RHS) {
9479 APInt c1 = LHS->getAPIntValue();
9480 APInt c2 = RHS->getAPIntValue();
9481 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
9481 return (c1 + c2).ult(OpSizeInBits);
9482 };
9484 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
9485 SDLoc DL(N);
9486 SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
9487 return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
9488 }
9489 }
9491 if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
9492 N0.getOperand(0).getOpcode() == ISD::SRL) {
9493 SDValue InnerShift = N0.getOperand(0);
9494 // TODO - support non-uniform vector shift amounts.
9495 if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
9496 uint64_t c1 = N001C->getZExtValue();
9497 uint64_t c2 = N1C->getZExtValue();
9498 EVT InnerShiftVT = InnerShift.getValueType();
9499 EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
9500 uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
9501 // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
9502 // This is only valid if the OpSizeInBits + c1 = size of inner shift.
9503 if (c1 + OpSizeInBits == InnerShiftSize) {
9504 SDLoc DL(N);
9505 if (c1 + c2 >= InnerShiftSize)
9506 return DAG.getConstant(0, DL, VT);
9507 SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
9508 SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
9509 InnerShift.getOperand(0), NewShiftAmt);
9510 return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
9511 }
9512 // In the more general case, we can clear the high bits after the shift:
9513 // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
9514 if (N0.hasOneUse() && InnerShift.hasOneUse() &&
9515 c1 + c2 < InnerShiftSize) {
9516 SDLoc DL(N);
9517 SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
9518 SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
9519 InnerShift.getOperand(0), NewShiftAmt);
9520 SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
9521 OpSizeInBits - c2),
9522 DL, InnerShiftVT);
9523 SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
9524 return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
9525 }
9526 }
9527 }
9529 // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
9530 // (and (srl x, (sub c2, c1), MASK)
9531 if (N0.getOpcode() == ISD::SHL &&
9532 (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
9533 TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
9534 auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
9535 ConstantSDNode *RHS) {
9536 const APInt &LHSC = LHS->getAPIntValue();
9537 const APInt &RHSC = RHS->getAPIntValue();
9538 return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
9539 LHSC.getZExtValue() <= RHSC.getZExtValue();
9540 };
9541 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
9542 /*AllowUndefs*/ false,
9543 /*AllowTypeMismatch*/ true)) {
9544 SDLoc DL(N);
9545 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9546 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
9547 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9548 Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
9549 Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
9550 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
9551 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9552 }
9553 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
9554 /*AllowUndefs*/ false,
9555 /*AllowTypeMismatch*/ true)) {
9556 SDLoc DL(N);
9557 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9558 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
9559 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9560 Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
9561 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
9562 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9563 }
9564 }
9566 // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
9567 // TODO - support non-uniform vector shift amounts.
9568 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
9569 // Shifting in all undef bits?
9570 EVT SmallVT = N0.getOperand(0).getValueType();
9571 unsigned BitSize = SmallVT.getScalarSizeInBits();
9572 if (N1C->getAPIntValue().uge(BitSize))
9573 return DAG.getUNDEF(VT);
9575 if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
9576 uint64_t ShiftAmt = N1C->getZExtValue();
9577 SDLoc DL0(N);
9578 SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
9579 N0.getOperand(0),
9580 DAG.getConstant(ShiftAmt, DL0,
9581 getShiftAmountTy(SmallVT)));
9582 AddToWorklist(SmallShift.getNode());
9583 APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
9584 SDLoc DL(N);
9585 return DAG.getNode(ISD::AND, DL, VT,
9586 DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
9587 DAG.getConstant(Mask, DL, VT));
9588 }
9589 }
9591 // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
9592 // bit, which is unmodified by sra.
9593 if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
9594 if (N0.getOpcode() == ISD::SRA)
9595 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
9596 }
9598 // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
9599 if (N1C && N0.getOpcode() == ISD::CTLZ &&
9600 N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
9601 KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));
9603 // If any of the input bits are KnownOne, then the input couldn't be all
9604 // zeros, thus the result of the srl will always be zero.
9605 if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
9607 // If all of the bits input to the ctlz node are known to be zero, then
9608 // the result of the ctlz is "32" and the result of the shift is one.
9609 APInt UnknownBits = ~Known.Zero;
9610 if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
9612 // Otherwise, check to see if there is exactly one bit input to the ctlz.
9613 if (UnknownBits.isPowerOf2()) {
9614 // Okay, we know that only the single bit specified by UnknownBits
9615 // could be set on input to the CTLZ node. If this bit is set, the SRL
9616 // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
9617 // to an SRL/XOR pair, which is likely to simplify more.
9618 unsigned ShAmt = UnknownBits.countTrailingZeros();
9619 SDValue Op = N0.getOperand(0);
9621 if (ShAmt) {
9622 SDLoc DL(N0);
9623 Op = DAG.getNode(ISD::SRL, DL, VT, Op,
9624 DAG.getConstant(ShAmt, DL,
9625 getShiftAmountTy(Op.getValueType())));
9626 AddToWorklist(Op.getNode());
9627 }
9629 SDLoc DL(N);
9630 return DAG.getNode(ISD::XOR, DL, VT,
9631 Op, DAG.getConstant(1, DL, VT));
9632 }
9633 }
9635 // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
9636 if (N1.getOpcode() == ISD::TRUNCATE &&
9637 N1.getOperand(0).getOpcode() == ISD::AND) {
9638 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
9639 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
9640 }
9642 // fold operands of srl based on knowledge that the low bits are not
9643 // demanded.
9644 if (SimplifyDemandedBits(SDValue(N, 0)))
9645 return SDValue(N, 0);
9647 if (N1C && !N1C->isOpaque())
9648 if (SDValue NewSRL = visitShiftByConstant(N))
9649 return NewSRL;
9651 // Attempt to convert a srl of a load into a narrower zero-extending load.
9652 if (SDValue NarrowLoad = reduceLoadWidth(N))
9653 return NarrowLoad;
9655 // Here is a common situation. We want to optimize:
9656 //
9657 // %a = ...
9658 // %b = and i32 %a, 2
9659 // %c = srl i32 %b, 1
9660 // brcond i32 %c ...
9661 //
9662 // into
9663 //
9664 // %a = ...
9665 // %b = and i32 %a, 2
9666 // %c = setcc eq %b, 0
9667 // brcond %c ...
9668 //
9669 // However, after the source operand of SRL is optimized into AND, the SRL
9670 // itself may not be optimized further. Look for it and add the BRCOND into
9671 // the worklist.
9672 if (N->hasOneUse()) {
9673 SDNode *Use = *N->use_begin();
9674 if (Use->getOpcode() == ISD::BRCOND)
9675 AddToWorklist(Use);
9676 else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
9677 // Also look past the truncate.
9678 Use = *Use->use_begin();
9679 if (Use->getOpcode() == ISD::BRCOND)
9680 AddToWorklist(Use);
9681 }
9682 }
9684 // Try to transform this shift into a multiply-high if
9685 // it matches the appropriate pattern detected in combineShiftToMULH.
9686 if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
9687 return MULH;
9689 return SDValue();
9690 }
9692 SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
9693 EVT VT = N->getValueType(0);
9694 SDValue N0 = N->getOperand(0);
9695 SDValue N1 = N->getOperand(1);
9696 SDValue N2 = N->getOperand(2);
9697 bool IsFSHL = N->getOpcode() == ISD::FSHL;
9698 unsigned BitWidth = VT.getScalarSizeInBits();
9700 // fold (fshl N0, N1, 0) -> N0
9701 // fold (fshr N0, N1, 0) -> N1
9702 if (isPowerOf2_32(BitWidth))
9703 if (DAG.MaskedValueIsZero(
9704 N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
9705 return IsFSHL ? N0 : N1;
9707 auto IsUndefOrZero = [](SDValue V) {
9708 return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
9709 };
9711 // TODO - support non-uniform vector shift amounts.
9712 if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
9713 EVT ShAmtTy = N2.getValueType();
9715 // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
9716 if (Cst->getAPIntValue().uge(BitWidth)) {
9717 uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
9718 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
9719 DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
9720 }
9722 unsigned ShAmt = Cst->getZExtValue();
9723 if (ShAmt == 0)
9724 return IsFSHL ? N0 : N1;
9726 // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
9727 // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
9728 // fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
9729 // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
9730 if (IsUndefOrZero(N0))
9731 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
9732 DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
9733 SDLoc(N), ShAmtTy));
9734 if (IsUndefOrZero(N1))
9735 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
9736 DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
9737 SDLoc(N), ShAmtTy));
9739 // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
9740 // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
9741 // TODO - bigendian support once we have test coverage.
9742 // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
9743 // TODO - permit LHS EXTLOAD if extensions are shifted out.
9744 if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
9745 !DAG.getDataLayout().isBigEndian()) {
9746 auto *LHS = dyn_cast<LoadSDNode>(N0);
9747 auto *RHS = dyn_cast<LoadSDNode>(N1);
9748 if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
9749 LHS->getAddressSpace() == RHS->getAddressSpace() &&
9750 (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
9751 ISD::isNON_EXTLoad(LHS)) {
9752 if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
9753 SDLoc DL(RHS);
9754 uint64_t PtrOff =
9755 IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
9756 Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
9757 bool Fast = false;
9758 if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
9759 RHS->getAddressSpace(), NewAlign,
9760 RHS->getMemOperand()->getFlags(), &Fast) &&
9761 Fast) {
9762 SDValue NewPtr = DAG.getMemBasePlusOffset(
9763 RHS->getBasePtr(), TypeSize::Fixed(PtrOff), DL);
9764 AddToWorklist(NewPtr.getNode());
9765 SDValue Load = DAG.getLoad(
9766 VT, DL, RHS->getChain(), NewPtr,
9767 RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
9768 RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
9769 // Replace the old load's chain with the new load's chain.
9770 WorklistRemover DeadNodes(*this);
9771 DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
9772 return Load;
9773 }
9774 }
9775 }
9776 }
9777 }
9779 // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
9780 // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
9781 // iff we know the shift amount is in range.
9782 // TODO: when is it worth doing SUB(BW, N2) as well?
9783 if (isPowerOf2_32(BitWidth)) {
9784 APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
9785 if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
9786 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
9787 if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
9788 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
9789 }
9791 // fold (fshl N0, N0, N2) -> (rotl N0, N2)
9792 // fold (fshr N0, N0, N2) -> (rotr N0, N2)
9793 // TODO: Investigate flipping this rotate if only one is legal; if funnel shift
9794 // is legal as well, we might be better off avoiding non-constant (BW - N2).
9795 unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
9796 if (N0 == N1 && hasOperation(RotOpc, VT))
9797 return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
9799 // Simplify, based on bits shifted out of N0/N1.
9800 if (SimplifyDemandedBits(SDValue(N, 0)))
9801 return SDValue(N, 0);
9803 return SDValue();
9804 }
9806 SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
9807 SDValue N0 = N->getOperand(0);
9808 SDValue N1 = N->getOperand(1);
9809 if (SDValue V = DAG.simplifyShift(N0, N1))
9810 return V;
9812 EVT VT = N0.getValueType();
9814 // fold (*shlsat c1, c2) -> c1<<c2
9815 if (SDValue C =
9816 DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0, N1}))
9817 return C;
9819 ConstantSDNode *N1C = isConstOrConstSplat(N1);
9821 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) {
9822 // fold (sshlsat x, c) -> (shl x, c)
9823 if (N->getOpcode() == ISD::SSHLSAT && N1C &&
9824 N1C->getAPIntValue().ult(DAG.ComputeNumSignBits(N0)))
9825 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
9827 // fold (ushlsat x, c) -> (shl x, c)
9828 if (N->getOpcode() == ISD::USHLSAT && N1C &&
9829 N1C->getAPIntValue().ule(
9830 DAG.computeKnownBits(N0).countMinLeadingZeros()))
9831 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
9832 }
9834 return SDValue();
9835 }
9837 // Given an ABS node, detect the following pattern:
9838 // (ABS (SUB (EXTEND a), (EXTEND b))).
9839 // Generates UABD/SABD instruction.
9840 static SDValue combineABSToABD(SDNode *N, SelectionDAG &DAG,
9841 const TargetLowering &TLI) {
9842 SDValue AbsOp1 = N->getOperand(0);
9843 SDValue Op0, Op1;
9845 if (AbsOp1.getOpcode() != ISD::SUB)
9846 return SDValue();
9848 Op0 = AbsOp1.getOperand(0);
9849 Op1 = AbsOp1.getOperand(1);
9851 unsigned Opc0 = Op0.getOpcode();
9852 // Check if the operands of the sub are (zero|sign)-extended.
9853 if (Opc0 != Op1.getOpcode() ||
9854 (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
9855 return SDValue();
9857 EVT VT = N->getValueType(0);
9858 EVT VT1 = Op0.getOperand(0).getValueType();
9859 EVT VT2 = Op1.getOperand(0).getValueType();
9860 unsigned ABDOpcode = (Opc0 == ISD::SIGN_EXTEND) ? ISD::ABDS : ISD::ABDU;
9862 // fold abs(sext(x) - sext(y)) -> zext(abds(x, y))
9863 // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y))
9864 // NOTE: Extensions must be equivalent.
9865 if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) {
9866 Op0 = Op0.getOperand(0);
9867 Op1 = Op1.getOperand(0);
9868 SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1);
9869 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD);
9870 }
9872 // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y))
9873 // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y))
9874 if (TLI.isOperationLegalOrCustom(ABDOpcode, VT))
9875 return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1);
9877 return SDValue();
9878 }
9880 SDValue DAGCombiner::visitABS(SDNode *N) {
9881 SDValue N0 = N->getOperand(0);
9882 EVT VT = N->getValueType(0);
9884 // fold (abs c1) -> c2
9885 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
9886 return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
9887 // fold (abs (abs x)) -> (abs x)
9888 if (N0.getOpcode() == ISD::ABS)
9889 return N0;
9890 // fold (abs x) -> x iff not-negative
9891 if (DAG.SignBitIsZero(N0))
9892 return N0;
9894 if (SDValue ABD = combineABSToABD(N, DAG, TLI))
9895 return ABD;
9897 return SDValue();
9898 }
9900 SDValue DAGCombiner::visitBSWAP(SDNode *N) {
9901 SDValue N0 = N->getOperand(0);
9902 EVT VT = N->getValueType(0);
9903 SDLoc DL(N);
9905 // fold (bswap c1) -> c2
9906 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
9907 return DAG.getNode(ISD::BSWAP, DL, VT, N0);
9908 // fold (bswap (bswap x)) -> x
9909 if (N0.getOpcode() == ISD::BSWAP)
9910 return N0.getOperand(0);
9912 // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse
9913 // isn't supported, it will be expanded to bswap followed by a manual reversal
9914 // of bits in each byte. By placing bswaps before bitreverse, we can remove
9915 // the two bswaps if the bitreverse gets expanded.
9916 if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) {
9917 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
9918 return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
9919 }
9921 // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
9922 // iff x >= bw/2 (i.e. lower half is known zero)
9923 unsigned BW = VT.getScalarSizeInBits();
9924 if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
9925 auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9926 EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
9927 if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
9928 ShAmt->getZExtValue() >= (BW / 2) &&
9929 (ShAmt->getZExtValue() % 16) == 0 && TLI.isTypeLegal(HalfVT) &&
9930 TLI.isTruncateFree(VT, HalfVT) &&
9931 (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
9932 SDValue Res = N0.getOperand(0);
9933 if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
9934 Res = DAG.getNode(ISD::SHL, DL, VT, Res,
9935 DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
9936 Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
9937 Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
9938 return DAG.getZExtOrTrunc(Res, DL, VT);
9939 }
9940 }
9942 // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
9943 // inverse-shift-of-bswap:
9944 // bswap (X u<< C) --> (bswap X) u>> C
9945 // bswap (X u>> C) --> (bswap X) u<< C
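// e.g. on i32 (illustrative): bswap (srl X, 8) --> shl (bswap X), 8, which
// frees the inner bswap to combine further with X's producer.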
9946 if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
9947 N0.hasOneUse()) {
9948 auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
9949 if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
9950 ShAmt->getZExtValue() % 8 == 0) {
9951 SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
9952 unsigned InverseShift = N0.getOpcode() == ISD::SHL ? ISD::SRL : ISD::SHL;
9953 return DAG.getNode(InverseShift, DL, VT, NewSwap, N0.getOperand(1));
9954 }
9955 }
9957 return SDValue();
9958 }
9960 SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
9961 SDValue N0 = N->getOperand(0);
9962 EVT VT = N->getValueType(0);
9964 // fold (bitreverse c1) -> c2
9965 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
9966 return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
9967 // fold (bitreverse (bitreverse x)) -> x
9968 if (N0.getOpcode() == ISD::BITREVERSE)
9969 return N0.getOperand(0);
9970 return SDValue();
9971 }
9973 SDValue DAGCombiner::visitCTLZ(SDNode *N) {
9974 SDValue N0 = N->getOperand(0);
9975 EVT VT = N->getValueType(0);
9977 // fold (ctlz c1) -> c2
9978 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
9979 return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
9981 // If the value is known never to be zero, switch to the undef version.
9982 if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
9983 if (DAG.isKnownNeverZero(N0))
9984 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
9985 }
9987 return SDValue();
9988 }
9990 SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
9991 SDValue N0 = N->getOperand(0);
9992 EVT VT = N->getValueType(0);
9994 // fold (ctlz_zero_undef c1) -> c2
9995 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
9996 return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
9997 return SDValue();
9998 }
10000 SDValue DAGCombiner::visitCTTZ(SDNode *N) {
10001 SDValue N0 = N->getOperand(0);
10002 EVT VT = N->getValueType(0);
10004 // fold (cttz c1) -> c2
10005 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
10006 return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
10008 // If the value is known never to be zero, switch to the undef version.
10009 if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
10010 if (DAG.isKnownNeverZero(N0))
10011 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
10012 }
10014 return SDValue();
10015 }
10017 SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
10018 SDValue N0 = N->getOperand(0);
10019 EVT VT = N->getValueType(0);
10021 // fold (cttz_zero_undef c1) -> c2
10022 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
10023 return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
10024 return SDValue();
10025 }
10027 SDValue DAGCombiner::visitCTPOP(SDNode *N) {
10028 SDValue N0 = N->getOperand(0);
10029 EVT VT = N->getValueType(0);
10031 // fold (ctpop c1) -> c2
10032 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
10033 return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
10034 return SDValue();
10035 }
10037 // FIXME: This should be checking for no signed zeros on individual operands, as
10038 // well as no nans.
10039 static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
10040 SDValue RHS,
10041 const TargetLowering &TLI) {
10042 const TargetOptions &Options = DAG.getTarget().Options;
10043 EVT VT = LHS.getValueType();
10045 return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
10046 TLI.isProfitableToCombineMinNumMaxNum(VT) &&
10047 DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
10048 }
10050 /// Generate Min/Max node
10051 static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
10052 SDValue RHS, SDValue True, SDValue False,
10053 ISD::CondCode CC, const TargetLowering &TLI,
10054 SelectionDAG &DAG) {
10055 if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
10056 return SDValue();
10058 EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
10059 switch (CC) {
10060 case ISD::SETOLT:
10061 case ISD::SETOLE:
10062 case ISD::SETLT:
10063 case ISD::SETLE:
10064 case ISD::SETULT:
10065 case ISD::SETULE: {
10066 // Since the operands are known never to be NaN if we get here, either
10067 // fminnum or fminnum_ieee is OK. Try the IEEE version first, since
10068 // fminnum is expanded in terms of it.
10069 unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
10070 if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
10071 return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
10073 unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
10074 if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
10075 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
10076 return SDValue();
10077 }
10078 case ISD::SETOGT:
10079 case ISD::SETOGE:
10080 case ISD::SETGT:
10081 case ISD::SETGE:
10082 case ISD::SETUGT:
10083 case ISD::SETUGE: {
10084 unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
10085 if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
10086 return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
10088 unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
10089 if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
10090 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
10091 return SDValue();
10092 }
10093 default:
10094 return SDValue();
10095 }
10096 }
10098 /// If a (v)select has a condition value that is a sign-bit test, try to smear
10099 /// the condition operand sign-bit across the value width and use it as a mask.
10100 static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
10101 SDValue Cond = N->getOperand(0);
10102 SDValue C1 = N->getOperand(1);
10103 SDValue C2 = N->getOperand(2);
10104 if (!isConstantOrConstantVector(C1) || !isConstantOrConstantVector(C2))
10105 return SDValue();
10107 EVT VT = N->getValueType(0);
10108 if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
10109 VT != Cond.getOperand(0).getValueType())
10110 return SDValue();
10112 // The inverted-condition + commuted-select variants of these patterns are
10113 // canonicalized to these forms in IR.
10114 SDValue X = Cond.getOperand(0);
10115 SDValue CondC = Cond.getOperand(1);
10116 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
10117 if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
10118 isAllOnesOrAllOnesSplat(C2)) {
10119 // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
10120 SDLoc DL(N);
10121 SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
10122 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
10123 return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
10124 }
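// Worked example (illustrative constants): i32 X > -1 ? 5 : -1. When X >= 0
// the arithmetic shift yields 0 and (0 | 5) == 5; when X < 0 it yields
// all-ones and (-1 | 5) == -1, matching both select arms.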
10125 if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
10126 // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
10127 SDLoc DL(N);
10128 SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
10129 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
10130 return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
10131 }
10133 return SDValue();
10134 }
10135 SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
10136 SDValue Cond = N->getOperand(0);
10137 SDValue N1 = N->getOperand(1);
10138 SDValue N2 = N->getOperand(2);
10139 EVT VT = N->getValueType(0);
10140 EVT CondVT = Cond.getValueType();
10142 SDLoc DL(N);
10143 if (!VT.isInteger())
10144 return SDValue();
10146 auto *C1 = dyn_cast<ConstantSDNode>(N1);
10147 auto *C2 = dyn_cast<ConstantSDNode>(N2);
10148 if (!C1 || !C2)
10149 return SDValue();
10151 // Only do this before legalization to avoid conflicting with target-specific
10152 // transforms in the other direction (create a select from a zext/sext). There
10153 // is also a target-independent combine here in DAGCombiner in the other
10154 // direction for (select Cond, -1, 0) when the condition is not i1.
10155 if (CondVT == MVT::i1 && !LegalOperations) {
10156 if (C1->isZero() && C2->isOne()) {
10157 // select Cond, 0, 1 --> zext (!Cond)
10158 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
10159 if (VT != MVT::i1)
10160 NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
10161 return NotCond;
10162 }
10163 if (C1->isZero() && C2->isAllOnes()) {
10164 // select Cond, 0, -1 --> sext (!Cond)
10165 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
10166 if (VT != MVT::i1)
10167 NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
10168 return NotCond;
10169 }
10170 if (C1->isOne() && C2->isZero()) {
10171 // select Cond, 1, 0 --> zext (Cond)
10172 if (VT != MVT::i1)
10173 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
10174 return Cond;
10175 }
10176 if (C1->isAllOnes() && C2->isZero()) {
10177 // select Cond, -1, 0 --> sext (Cond)
10178 if (VT != MVT::i1)
10179 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
10180 return Cond;
10181 }
10182 }
10183 // Use a target hook because some targets may prefer to transform in the
10184 // other direction.
10185 if (TLI.convertSelectOfConstantsToMath(VT)) {
10186 // For any constants that differ by 1, we can transform the select into an
10187 // 'add'.
10188 const APInt &C1Val = C1->getAPIntValue();
10189 const APInt &C2Val = C2->getAPIntValue();
10190 if (C1Val - 1 == C2Val) {
10191 // select Cond, C1, C1-1 --> add (zext Cond), C1-1
10193 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
10194 return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
10195 }
10196 if (C1Val + 1 == C2Val) {
10197 // select Cond, C1, C1+1 --> add (sext Cond), C1+1
10199 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
10200 return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
10201 }
10203 // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
10204 if (C1Val.isPowerOf2() && C2Val.isZero()) {
10206 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
10207 SDValue ShAmtC =
10208 DAG.getShiftAmountConstant(C1Val.exactLogBase2(), VT, DL);
10209 return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
10210 }
10212 if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
10213 return V;
10214 }
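// Illustrative instances of the constant-adjacent folds above (constants
// assumed): select Cond, 7, 6 --> add (zext Cond), 6, and select Cond, 5, 6
// --> add (sext Cond), 6, since sign-extending an i1 true gives -1.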
10219 // fold (select Cond, 0, 1) -> (xor Cond, 1)
10220 // We can't do this reliably if integer based booleans have different contents
10221 // to floating point based booleans. This is because we can't tell whether we
10222 // have an integer-based boolean or a floating-point-based boolean unless we
10223 // can find the SETCC that produced it and inspect its operands. This is
10224 // fairly easy if C is the SETCC node, but it can potentially be
10225 // undiscoverable (or not reasonably discoverable). For example, it could be
10226 // in another basic block or it could require searching a complicated
10227 // expression.
10228 if (CondVT.isInteger() &&
10229 TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
10230 TargetLowering::ZeroOrOneBooleanContent &&
10231 TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
10232 TargetLowering::ZeroOrOneBooleanContent &&
10233 C1->isZero() && C2->isOne()) {
10234 SDValue NotCond =
10235 DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
10236 if (VT.bitsEq(CondVT))
10237 return NotCond;
10238 return DAG.getZExtOrTrunc(NotCond, DL, VT);
10239 }
10241 return SDValue();
10242 }
10244 static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
10245 assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT) &&
10246 "Expected a (v)select");
10247 SDValue Cond = N->getOperand(0);
10248 SDValue T = N->getOperand(1), F = N->getOperand(2);
10249 EVT VT = N->getValueType(0);
10250 if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
10251 return SDValue();
10253 // select Cond, Cond, F --> or Cond, F
10254 // select Cond, 1, F --> or Cond, F
10255 if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
10256 return DAG.getNode(ISD::OR, SDLoc(N), VT, Cond, F);
10258 // select Cond, T, Cond --> and Cond, T
10259 // select Cond, T, 0 --> and Cond, T
10260 if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
10261 return DAG.getNode(ISD::AND, SDLoc(N), VT, Cond, T);
10263 // select Cond, T, 1 --> or (not Cond), T
10264 if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
10265 SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
10266 return DAG.getNode(ISD::OR, SDLoc(N), VT, NotCond, T);
10269 // select Cond, 0, F --> and (not Cond), F
10270 if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
10271 SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
10272 return DAG.getNode(ISD::AND, SDLoc(N), VT, NotCond, F);
10273 }
10275 return SDValue();
10276 }
10278 static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
10279 SDValue N0 = N->getOperand(0);
10280 SDValue N1 = N->getOperand(1);
10281 SDValue N2 = N->getOperand(2);
10282 EVT VT = N->getValueType(0);
10283 if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse())
10284 return SDValue();
10286 SDValue Cond0 = N0.getOperand(0);
10287 SDValue Cond1 = N0.getOperand(1);
10288 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10289 if (VT != Cond0.getValueType())
10290 return SDValue();
10292 // Match a signbit check of Cond0 as "Cond0 s<0". Swap select operands if the
10293 // compare is inverted from that pattern ("Cond0 s> -1").
10294 if (CC == ISD::SETLT && isNullOrNullSplat(Cond1))
10295 ; // This is the pattern we are looking for.
10296 else if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(Cond1))
10297 std::swap(N1, N2);
10298 else
10299 return SDValue();
10301 // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
10302 if (isNullOrNullSplat(N2)) {
10303 SDLoc DL(N);
10304 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10305 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10306 return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
10307 }
10309 // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
10310 if (isAllOnesOrAllOnesSplat(N1)) {
10311 SDLoc DL(N);
10312 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10313 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10314 return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
10315 }
10317 // If we have to invert the sign bit mask, only do that transform if the
10318 // target has a bitwise 'and not' instruction (the invert is free).
10319 // (Cond0 s< 0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
10320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10321 if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
10322 SDLoc DL(N);
10323 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10324 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10325 SDValue Not = DAG.getNOT(DL, Sra, VT);
10326 return DAG.getNode(ISD::AND, DL, VT, Not, N2);
10327 }
10329 // TODO: There's another pattern in this family, but it may require
10330 // implementing hasOrNot() to check for profitability:
10331 // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
10333 return SDValue();
10334 }
10336 SDValue DAGCombiner::visitSELECT(SDNode *N) {
10337 SDValue N0 = N->getOperand(0);
10338 SDValue N1 = N->getOperand(1);
10339 SDValue N2 = N->getOperand(2);
10340 EVT VT = N->getValueType(0);
10341 EVT VT0 = N0.getValueType();
10342 SDLoc DL(N);
10343 SDNodeFlags Flags = N->getFlags();
10345 if (SDValue V = DAG.simplifySelect(N0, N1, N2))
10346 return V;
10348 if (SDValue V = foldSelectOfConstants(N))
10349 return V;
10351 if (SDValue V = foldBoolSelectToLogic(N, DAG))
10352 return V;
10354 // If we can fold this based on the true/false value, do so.
10355 if (SimplifySelectOps(N, N1, N2))
10356 return SDValue(N, 0); // Don't revisit N.
10358 if (VT0 == MVT::i1) {
10359 // The code in this block deals with the following 2 equivalences:
10360 // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
10361 // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
10362 // The target can specify its preferred form with the
10363 // shouldNormalizeToSelectSequence() callback. However we always transform
10364 // to the right anyway if we find the inner select exists in the DAG anyway
10365 // and we always transform to the left side if we know that we can further
10366 // optimize the combination of the conditions.
10367 bool normalizeToSequence =
10368 TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
10369 // select (and Cond0, Cond1), X, Y
10370 // -> select Cond0, (select Cond1, X, Y), Y
10371 if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
10372 SDValue Cond0 = N0->getOperand(0);
10373 SDValue Cond1 = N0->getOperand(1);
10374 SDValue InnerSelect =
10375 DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
10376 if (normalizeToSequence || !InnerSelect.use_empty())
10377 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
10378 InnerSelect, N2, Flags);
10379 // Cleanup on failure.
10380 if (InnerSelect.use_empty())
10381 recursivelyDeleteUnusedNodes(InnerSelect.getNode());
10383 // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
10384 if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
10385 SDValue Cond0 = N0->getOperand(0);
10386 SDValue Cond1 = N0->getOperand(1);
10387 SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
10388 Cond1, N1, N2, Flags);
10389 if (normalizeToSequence || !InnerSelect.use_empty())
10390 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
10391 InnerSelect, Flags);
10392 // Cleanup on failure.
10393 if (InnerSelect.use_empty())
10394 recursivelyDeleteUnusedNodes(InnerSelect.getNode());
10397 // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
10398 if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
10399 SDValue N1_0 = N1->getOperand(0);
10400 SDValue N1_1 = N1->getOperand(1);
10401 SDValue N1_2 = N1->getOperand(2);
10402 if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
10403 // Create the actual and node if we can generate good code for it.
10404 if (!normalizeToSequence) {
10405 SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
10406 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
10407 N1_2, Flags);
10408 }
10409 // Otherwise see if we can optimize the "and" to a better pattern.
10410 if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
10411 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
10412 N1_2, Flags);
10413 }
10414 }
10415 }
10416 // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
10417 if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
10418 SDValue N2_0 = N2->getOperand(0);
10419 SDValue N2_1 = N2->getOperand(1);
10420 SDValue N2_2 = N2->getOperand(2);
10421 if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
10422 // Create the actual or node if we can generate good code for it.
10423 if (!normalizeToSequence) {
10424 SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
10425 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
10426 N2_2, Flags);
10427 }
10428 // Otherwise see if we can optimize to a better pattern.
10429 if (SDValue Combined = visitORLike(N0, N2_0, N))
10430 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
10431 N2_2, Flags);
10432 }
10433 }
10434 }
10436 // select (not Cond), N1, N2 -> select Cond, N2, N1
10437 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
10438 SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
10439 SelectOp->setFlags(Flags);
10440 return SelectOp;
10441 }
10443 // Fold selects based on a setcc into other things, such as min/max/abs.
10444 if (N0.getOpcode() == ISD::SETCC) {
10445 SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
10446 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10448 // select (fcmp lt x, y), x, y -> fminnum x, y
10449 // select (fcmp gt x, y), x, y -> fmaxnum x, y
10451 // This is OK if we don't care what happens if either operand is a NaN.
10452 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
10453 if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2,
10454 CC, TLI, DAG))
10455 return FMinMax;
10457 // Use 'unsigned add with overflow' to optimize an unsigned saturating add.
10458 // This is conservatively limited to pre-legal-operations to give targets
10459 // a chance to reverse the transform if they want to do that. Also, it is
10460 // unlikely that the pattern would be formed late, so it's probably not
10461 // worth going through the other checks.
10462 if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
10463 CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
10464 N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
10465 auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
10466 auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
10467 if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
10468 // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
10469 // uaddo Cond0, C; select uaddo.1, -1, uaddo.0
10471 // The IR equivalent of this transform would have this form:
10473 // %c = icmp ugt %x, ~C
10474 // %r = select %c, -1, %a
10476 // %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
10477 // %u0 = extractvalue %u, 0
10478 // %u1 = extractvalue %u, 1
10479 // %r = select %u1, -1, %u0
10480 SDVTList VTs = DAG.getVTList(VT, VT0);
10481 SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
10482 return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
10483 }
10484 }
10486 if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
10487 (!LegalOperations &&
10488 TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
10489 // Any flags available in a select/setcc fold will be on the setcc as they
10490 // migrated from fcmp
10491 Flags = N0->getFlags();
10492 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
10493 N2, N0.getOperand(2));
10494 SelectNode->setFlags(Flags);
10495 return SelectNode;
10496 }
10498 if (SDValue NewSel = SimplifySelect(DL, N0, N1, N2))
10499 return NewSel;
10502 if (!VT.isVector())
10503 if (SDValue BinOp = foldSelectOfBinops(N))
10504 return BinOp;
10506 return SDValue();
10507 }
10509 // This function assumes all the vselect's arguments are CONCAT_VECTOR
10510 // nodes and that the condition is a BV of ConstantSDNodes (or undefs).
10511 static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
10512 SDLoc DL(N);
10513 SDValue Cond = N->getOperand(0);
10514 SDValue LHS = N->getOperand(1);
10515 SDValue RHS = N->getOperand(2);
10516 EVT VT = N->getValueType(0);
10517 int NumElems = VT.getVectorNumElements();
10518 assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
10519 RHS.getOpcode() == ISD::CONCAT_VECTORS &&
10520 Cond.getOpcode() == ISD::BUILD_VECTOR);
10522 // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
10523 // binary ones here.
10524 if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
10525 return SDValue();
10527 // We're sure we have an even number of elements due to the
10528 // concat_vectors we have as arguments to vselect.
10529 // Skip BV elements until we find one that's not an UNDEF
10530 // After we find an UNDEF element, keep looping until we get to half the
10531 // length of the BV and see if all the non-undef nodes are the same.
10532 ConstantSDNode *BottomHalf = nullptr;
10533 for (int i = 0; i < NumElems / 2; ++i) {
10534 if (Cond->getOperand(i)->isUndef())
10535 continue;
10537 if (BottomHalf == nullptr)
10538 BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
10539 else if (Cond->getOperand(i).getNode() != BottomHalf)
10540 return SDValue();
10541 }
10543 // Do the same for the second half of the BuildVector
10544 ConstantSDNode *TopHalf = nullptr;
10545 for (int i = NumElems / 2; i < NumElems; ++i) {
10546 if (Cond->getOperand(i)->isUndef())
10547 continue;
10549 if (TopHalf == nullptr)
10550 TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
10551 else if (Cond->getOperand(i).getNode() != TopHalf)
10552 return SDValue();
10553 }
10555 assert(TopHalf && BottomHalf &&
10556 "One half of the selector was all UNDEFs and the other was all the "
10557 "same value. This should have been addressed before this function.");
10558 return DAG.getNode(
10559 ISD::CONCAT_VECTORS, DL, VT,
10560 BottomHalf->isZero() ? RHS->getOperand(0) : LHS->getOperand(0),
10561 TopHalf->isZero() ? RHS->getOperand(1) : LHS->getOperand(1));
10562 }
10564 bool refineUniformBase(SDValue &BasePtr, SDValue &Index, bool IndexIsScaled,
10565 SelectionDAG &DAG) {
10566 if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD)
10567 return false;
10569 // Only perform the transformation when existing operands can be reused.
10570 if (IndexIsScaled)
10571 return false;
10573 // For now we check only the LHS of the add.
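// e.g. a gather of (add (splat %base), %offsets) with a null base pointer
// can instead use %base as the scalar base and %offsets as the index, so
// both existing operands are reused (illustrative operand names).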
10574 SDValue LHS = Index.getOperand(0);
10575 SDValue SplatVal = DAG.getSplatValue(LHS);
10576 if (!SplatVal || SplatVal.getValueType() != BasePtr.getValueType())
10577 return false;
10579 BasePtr = SplatVal;
10580 Index = Index.getOperand(1);
10581 return true;
10582 }
10584 // Fold sext/zext of index into index type.
10585 bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType, EVT DataVT,
10586 SelectionDAG &DAG) {
10587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10589 // It's always safe to look through zero extends.
10590 if (Index.getOpcode() == ISD::ZERO_EXTEND) {
10591 SDValue Op = Index.getOperand(0);
10592 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
10593 IndexType = ISD::UNSIGNED_SCALED;
10594 Index = Op;
10595 return true;
10596 }
10597 if (ISD::isIndexTypeSigned(IndexType)) {
10598 IndexType = ISD::UNSIGNED_SCALED;
10599 return true;
10600 }
10601 }
10603 // It's only safe to look through sign extends when Index is signed.
10604 if (Index.getOpcode() == ISD::SIGN_EXTEND &&
10605 ISD::isIndexTypeSigned(IndexType)) {
10606 SDValue Op = Index.getOperand(0);
10607 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
10608 Index = Op;
10609 return true;
10610 }
10611 }
10613 return false;
10614 }
10616 SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
10617 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
10618 SDValue Mask = MSC->getMask();
10619 SDValue Chain = MSC->getChain();
10620 SDValue Index = MSC->getIndex();
10621 SDValue Scale = MSC->getScale();
10622 SDValue StoreVal = MSC->getValue();
10623 SDValue BasePtr = MSC->getBasePtr();
10624 ISD::MemIndexType IndexType = MSC->getIndexType();
10625 SDLoc DL(N);
10627 // Zap scatters with a zero mask.
10628 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10629 return Chain;
10631 if (refineUniformBase(BasePtr, Index, MSC->isIndexScaled(), DAG)) {
10632 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
10633 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
10634 DL, Ops, MSC->getMemOperand(), IndexType,
10635 MSC->isTruncatingStore());
10638 if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) {
10639 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
10640 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
10641 DL, Ops, MSC->getMemOperand(), IndexType,
10642 MSC->isTruncatingStore());
10643 }
10645 return SDValue();
10646 }
10648 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
10649 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
10650 SDValue Mask = MST->getMask();
10651 SDValue Chain = MST->getChain();
10652 SDValue Value = MST->getValue();
10653 SDValue Ptr = MST->getBasePtr();
10654 SDLoc DL(N);
10656 // Zap masked stores with a zero mask.
10657 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10658 return Chain;
10660 // If this is a masked store with an all-ones mask, we can use an unmasked store.
10661 // FIXME: Can we do this for indexed, compressing, or truncating stores?
10662 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() &&
10663 !MST->isCompressingStore() && !MST->isTruncatingStore())
10664 return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(),
10665 MST->getBasePtr(), MST->getPointerInfo(),
10666 MST->getOriginalAlign(), MachineMemOperand::MOStore,
10667 MST->getAAInfo());
10669 // Try transforming N to an indexed store.
10670 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
10671 return SDValue(N, 0);
10673 if (MST->isTruncatingStore() && MST->isUnindexed() &&
10674 Value.getValueType().isInteger() &&
10675 (!isa<ConstantSDNode>(Value) ||
10676 !cast<ConstantSDNode>(Value)->isOpaque())) {
10677 APInt TruncDemandedBits =
10678 APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
10679 MST->getMemoryVT().getScalarSizeInBits());
10681 // See if we can simplify the operation with
10682 // SimplifyDemandedBits, which only works if the value has a single use.
10683 if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
10684 // Re-visit the store if anything changed and the store hasn't been merged
10685 // with another node (N is deleted) SimplifyDemandedBits will add Value's
10686 // node back to the worklist if necessary, but we also need to re-visit
10687 // the Store node itself.
10688 if (N->getOpcode() != ISD::DELETED_NODE)
10689 AddToWorklist(N);
10690 return SDValue(N, 0);
10691 }
10692 }
10694 // If this is a TRUNC followed by a masked store, fold this into a masked
10695 // truncating store. We can do this even if this is already a masked
10696 // truncstore.
10697 if ((Value.getOpcode() == ISD::TRUNCATE) && Value->hasOneUse() &&
10698 MST->isUnindexed() &&
10699 TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
10700 MST->getMemoryVT(), LegalOperations)) {
10701 auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(),
10702 Value.getOperand(0).getValueType());
10703 return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
10704 MST->getOffset(), Mask, MST->getMemoryVT(),
10705 MST->getMemOperand(), MST->getAddressingMode(),
10706 /*IsTruncating=*/true);
10707 }
10709 return SDValue();
10710 }
10712 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
10713 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
10714 SDValue Mask = MGT->getMask();
10715 SDValue Chain = MGT->getChain();
10716 SDValue Index = MGT->getIndex();
10717 SDValue Scale = MGT->getScale();
10718 SDValue PassThru = MGT->getPassThru();
10719 SDValue BasePtr = MGT->getBasePtr();
10720 ISD::MemIndexType IndexType = MGT->getIndexType();
10721 SDLoc DL(N);
10723 // Zap gathers with a zero mask.
10724 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10725 return CombineTo(N, PassThru, MGT->getChain());
10727 if (refineUniformBase(BasePtr, Index, MGT->isIndexScaled(), DAG)) {
10728 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
10729 return DAG.getMaskedGather(
10730 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
10731 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
10734 if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) {
10735 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
10736 return DAG.getMaskedGather(
10737 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
10738 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
10739 }
10741 return SDValue();
10742 }
10744 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
10745 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
10746 SDValue Mask = MLD->getMask();
10749 // Zap masked loads with a zero mask.
10750 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10751 return CombineTo(N, MLD->getPassThru(), MLD->getChain());
10753 // If this is a masked load with an all-ones mask, we can use an unmasked load.
10754 // FIXME: Can we do this for indexed, expanding, or extending loads?
10755 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() &&
10756 !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) {
10757 SDValue NewLd = DAG.getLoad(
10758 N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(),
10759 MLD->getPointerInfo(), MLD->getOriginalAlign(),
10760 MachineMemOperand::MOLoad, MLD->getAAInfo(), MLD->getRanges());
10761 return CombineTo(N, NewLd, NewLd.getValue(1));
10764 // Try transforming N to an indexed load.
10765 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
10766 return SDValue(N, 0);
10768 return SDValue();
10769 }
10771 /// A vector select of 2 constant vectors can be simplified to math/logic to
10772 /// avoid a variable select instruction and possibly avoid constant loads.
10773 SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
10774 SDValue Cond = N->getOperand(0);
10775 SDValue N1 = N->getOperand(1);
10776 SDValue N2 = N->getOperand(2);
10777 EVT VT = N->getValueType(0);
10778 if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
10779 !TLI.convertSelectOfConstantsToMath(VT) ||
10780 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
10781 !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
10782 return SDValue();
10784 // Check if we can use the condition value to increment/decrement a single
10785 // constant value. This simplifies a select to an add and removes a constant
10786 // load/materialization from the general case.
10787 bool AllAddOne = true;
10788 bool AllSubOne = true;
10789 unsigned Elts = VT.getVectorNumElements();
10790 for (unsigned i = 0; i != Elts; ++i) {
10791 SDValue N1Elt = N1.getOperand(i);
10792 SDValue N2Elt = N2.getOperand(i);
10793 if (N1Elt.isUndef() || N2Elt.isUndef())
10794 continue;
10795 if (N1Elt.getValueType() != N2Elt.getValueType())
10796 continue;
10798 const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
10799 const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
10800 if (C1 != C2 + 1)
10801 AllAddOne = false;
10802 if (C1 != C2 - 1)
10803 AllSubOne = false;
10804 }
10806 // Further simplifications for the extra-special cases where the constants are
10807 // all 0 or all -1 should be implemented as folds of these patterns.
10808 SDLoc DL(N);
10809 if (AllAddOne || AllSubOne) {
10810 // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
10811 // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
10812 auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
10813 SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
10814 return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
10817 // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
10818 APInt Pow2C;
10819 if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
10820 isNullOrNullSplat(N2)) {
10821 SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
10822 SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
10823 return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
10824 }
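// e.g. (illustrative) vselect Cond, splat(8), splat(0) --> shl (zext Cond),
// splat(3), since 8 == 1 << 3 and the false arm is zero.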
10826 if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
10827 return V;
10829 // The general case for select-of-constants:
10830 // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
10831 // ...but that only makes sense if a vselect is slower than 2 logic ops, so
10832 // leave that to a machine-specific pass.
10833 return SDValue();
10834 }
10836 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
10837 SDValue N0 = N->getOperand(0);
10838 SDValue N1 = N->getOperand(1);
10839 SDValue N2 = N->getOperand(2);
10840 EVT VT = N->getValueType(0);
10841 SDLoc DL(N);
10843 if (SDValue V = DAG.simplifySelect(N0, N1, N2))
10844 return V;
10846 if (SDValue V = foldBoolSelectToLogic(N, DAG))
10847 return V;
10849 // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
10850 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
10851 return DAG.getSelect(DL, VT, F, N2, N1);
10853 // Canonicalize integer abs.
10854 // vselect (setg[te] X, 0), X, -X ->
10855 // vselect (setgt X, -1), X, -X ->
10856 // vselect (setl[te] X, 0), -X, X ->
10857 // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
10858 if (N0.getOpcode() == ISD::SETCC) {
10859 SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
10860 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10861 bool isAbs = false;
10862 bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
10864 if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
10865 (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
10866 N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
10867 isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
10868 else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
10869 N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
10870 isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
10872 if (isAbs) {
10873 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
10874 return DAG.getNode(ISD::ABS, DL, VT, LHS);
10876 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
10877 DAG.getConstant(VT.getScalarSizeInBits() - 1,
10878 DL, getShiftAmountTy(VT)));
10879 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
10880 AddToWorklist(Shift.getNode());
10881 AddToWorklist(Add.getNode());
10882 return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
10883 }
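// Illustrative check of this expansion: for X < 0, Y is all-ones, so the
// add produces X - 1 and the xor gives ~(X - 1) == -X; for X >= 0, Y is
// zero and X passes through unchanged.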
10885 // vselect x, y (fcmp lt x, y) -> fminnum x, y
10886 // vselect x, y (fcmp gt x, y) -> fmaxnum x, y
10888 // This is OK if we don't care about what happens if either operand is a
10889 // NaN.
10891 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
10892 if (SDValue FMinMax =
10893 combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
10894 return FMinMax;
10895 }
10897 if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
10898 return S;
10899 if (SDValue S = PerformUMinFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
10900 return S;
10902 // If this select has a condition (setcc) with narrower operands than the
10903 // select, try to widen the compare to match the select width.
10904 // TODO: This should be extended to handle any constant.
10905 // TODO: This could be extended to handle non-loading patterns, but that
10906 // requires thorough testing to avoid regressions.
10907 if (isNullOrNullSplat(RHS)) {
10908 EVT NarrowVT = LHS.getValueType();
10909 EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
10910 EVT SetCCVT = getSetCCResultType(LHS.getValueType());
10911 unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
10912 unsigned WideWidth = WideVT.getScalarSizeInBits();
10913 bool IsSigned = isSignedIntSetCC(CC);
10914 auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
10915 if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
10916 SetCCWidth != 1 && SetCCWidth < WideWidth &&
10917 TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
10918 TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
10919 // Both compare operands can be widened for free. The LHS can use an
10920 // extended load, and the RHS is a constant:
10921 // vselect (ext (setcc load(X), C)), N1, N2 -->
10922 // vselect (setcc extload(X), C'), N1, N2
10923 auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
10924 SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
10925 SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
10926 EVT WideSetCCVT = getSetCCResultType(WideVT);
10927 SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
10928 return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
10929 }
10930 }
10932 // Match VSELECTs into add with unsigned saturation.
10933 if (hasOperation(ISD::UADDSAT, VT)) {
10934 // Check if one of the arms of the VSELECT is a vector with all bits set.
10935 // If it's on the left side invert the predicate to simplify logic below.
10936 SDValue Other;
10937 ISD::CondCode SatCC = CC;
10938 if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) {
10939 Other = N2;
10940 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
10941 } else if (ISD::isConstantSplatVectorAllOnes(N2.getNode())) {
10942 Other = N1;
10943 }
10945 if (Other && Other.getOpcode() == ISD::ADD) {
10946 SDValue CondLHS = LHS, CondRHS = RHS;
10947 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
10949 // Canonicalize condition operands.
10950 if (SatCC == ISD::SETUGE) {
10951 std::swap(CondLHS, CondRHS);
10952 SatCC = ISD::SETULE;
10955 // We can test against either of the addition operands.
10956 // x <= x+y ? x+y : ~0 --> uaddsat x, y
10957 // x+y >= x ? x+y : ~0 --> uaddsat x, y
10958 if (SatCC == ISD::SETULE && Other == CondRHS &&
10959 (OpLHS == CondLHS || OpRHS == CondLHS))
10960 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
10962 if (OpRHS.getOpcode() == CondRHS.getOpcode() &&
10963 (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
10964 OpRHS.getOpcode() == ISD::SPLAT_VECTOR) &&
10965 CondLHS == OpLHS) {
10966 // If the RHS is a constant we have to reverse the const
10967 // canonicalization.
10968 // x >= ~C ? x+C : ~0 --> uaddsat x, C
10969 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
10970 return Cond->getAPIntValue() == ~Op->getAPIntValue();
10971 };
10972 if (SatCC == ISD::SETULE &&
10973 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
10974 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
10975 }
10976 }
10977 }
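// e.g. for i8 with C == 42, so ~C == 213 (illustrative constants):
// x u<= 213 ? x + 42 : 255 --> uaddsat x, 42.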
10979 // Match VSELECTs into sub with unsigned saturation.
10980 if (hasOperation(ISD::USUBSAT, VT)) {
10981 // Check if one of the arms of the VSELECT is a zero vector. If it's on
10982 // the left side invert the predicate to simplify logic below.
10983 SDValue Other;
10984 ISD::CondCode SatCC = CC;
10985 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
10986 Other = N2;
10987 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
10988 } else if (ISD::isConstantSplatVectorAllZeros(N2.getNode())) {
10989 Other = N1;
10990 }
10992 // zext(x) >= y ? trunc(zext(x) - y) : 0
10993 // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
10994 // zext(x) > y ? trunc(zext(x) - y) : 0
10995 // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
10996 if (Other && Other.getOpcode() == ISD::TRUNCATE &&
10997 Other.getOperand(0).getOpcode() == ISD::SUB &&
10998 (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT)) {
10999 SDValue OpLHS = Other.getOperand(0).getOperand(0);
11000 SDValue OpRHS = Other.getOperand(0).getOperand(1);
11001 if (LHS == OpLHS && RHS == OpRHS && LHS.getOpcode() == ISD::ZERO_EXTEND)
11002 if (SDValue R = getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS,
11003 DL))
11004 return R;
11005 }
11007 if (Other && Other.getNumOperands() == 2) {
11008 SDValue CondRHS = RHS;
11009 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
11011 if (OpLHS == LHS) {
11012 // Look for a general sub with unsigned saturation first.
11013 // x >= y ? x-y : 0 --> usubsat x, y
11014 // x > y ? x-y : 0 --> usubsat x, y
11015 if ((SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) &&
11016 Other.getOpcode() == ISD::SUB && OpRHS == CondRHS)
11017 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11019 if (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
11020 OpRHS.getOpcode() == ISD::SPLAT_VECTOR) {
11021 if (CondRHS.getOpcode() == ISD::BUILD_VECTOR ||
11022 CondRHS.getOpcode() == ISD::SPLAT_VECTOR) {
11023 // If the RHS is a constant we have to reverse the const
11024 // canonicalization.
11025 // x > C-1 ? x+-C : 0 --> usubsat x, C
11026 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
11027 return (!Op && !Cond) ||
11028 (Op && Cond &&
11029 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
11030 };
11031 if (SatCC == ISD::SETUGT && Other.getOpcode() == ISD::ADD &&
11032 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
11033 /*AllowUndefs*/ true)) {
11034 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
11035 DAG.getConstant(0, DL, VT), OpRHS);
11036 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11039 // Another special case: If C was a sign bit, the sub has been
11040 // canonicalized into a xor.
11041 // FIXME: Would it be better to use computeKnownBits to
11042 // determine whether it's safe to decanonicalize the xor?
11043 // x s< 0 ? x^C : 0 --> usubsat x, C
11044 APInt SplatValue;
11045 if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
11046 ISD::isConstantSplatVector(OpRHS.getNode(), SplatValue) &&
11047 ISD::isConstantSplatVectorAllZeros(CondRHS.getNode()) &&
11048 SplatValue.isSignMask()) {
11049 // Note that we have to rebuild the RHS constant here to
11050 // ensure we don't rely on particular values of undef lanes.
11051 OpRHS = DAG.getConstant(SplatValue, DL, VT);
11052 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11053 }
11054 }
11055 }
11056 }
11057 }
11058 }
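// e.g. for i8 (illustrative): x s< 0 ? x ^ 0x80 : 0 --> usubsat x, 0x80;
// clearing the sign bit of a negative x equals x - 128, and the subtract
// saturates to 0 exactly when x s>= 0.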
11061 if (SimplifySelectOps(N, N1, N2))
11062 return SDValue(N, 0); // Don't revisit N.
11064 // Fold (vselect all_ones, N1, N2) -> N1
11065 if (ISD::isConstantSplatVectorAllOnes(N0.getNode()))
11066 return N1;
11067 // Fold (vselect all_zeros, N1, N2) -> N2
11068 if (ISD::isConstantSplatVectorAllZeros(N0.getNode()))
11069 return N2;
11071 // The ConvertSelectToConcatVector function is assuming both the above
11072 // checks for (vselect (build_vector all{ones,zeros}) ...) have been made
11074 if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
11075 N2.getOpcode() == ISD::CONCAT_VECTORS &&
11076 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
11077 if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
11078 return CV;
11079 }
11081 if (SDValue V = foldVSelectOfConstants(N))
11082 return V;
11084 if (hasOperation(ISD::SRA, VT))
11085 if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG))
11086 return V;
11088 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
11089 return SDValue(N, 0);
11091 return SDValue();
11092 }
11094 SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
11095 SDValue N0 = N->getOperand(0);
11096 SDValue N1 = N->getOperand(1);
11097 SDValue N2 = N->getOperand(2);
11098 SDValue N3 = N->getOperand(3);
11099 SDValue N4 = N->getOperand(4);
11100 ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
11102 // fold select_cc lhs, rhs, x, x, cc -> x
11103 if (N2 == N3)
11104 return N2;
11106 // Determine if the condition we're dealing with is constant
11107 if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
11108 CC, SDLoc(N), false)) {
11109 AddToWorklist(SCC.getNode());
11111 // cond always true -> true val
11112 // cond always false -> false val
11113 if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
11114 return SCCC->isZero() ? N3 : N2;
11116 // When the condition is UNDEF, just return the first operand. This is
11117 // consistent with DAG creation: no setcc node is created in this case.
11118 if (SCC->isUndef())
11119 return N2;
11121 // Fold to a simpler select_cc
11122 if (SCC.getOpcode() == ISD::SETCC) {
11123 SDValue SelectOp = DAG.getNode(
11124 ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
11125 SCC.getOperand(1), N2, N3, SCC.getOperand(2));
11126 SelectOp->setFlags(SCC->getFlags());
11127 return SelectOp;
11128 }
11129 }
11131 // If we can fold this based on the true/false value, do so.
11132 if (SimplifySelectOps(N, N2, N3))
11133 return SDValue(N, 0); // Don't revisit N.
11135 // fold select_cc into other things, such as min/max/abs
11136 return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
11137 }
11139 SDValue DAGCombiner::visitSETCC(SDNode *N) {
11140 // setcc is very commonly used as an argument to brcond. This pattern
11141 // also lends itself to numerous combines and, as a result, it is desirable
11142 // to keep the argument to a brcond as a setcc as much as possible.
11143 bool PreferSetCC =
11144 N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
11146 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11147 EVT VT = N->getValueType(0);
11149 // SETCC(FREEZE(X), CONST, Cond)
11150 // =>
11151 // FREEZE(SETCC(X, CONST, Cond))
11152 // This is correct if FREEZE(X) has one use and SETCC(FREEZE(X), CONST, Cond)
11153 // isn't equivalent to true or false.
11154 // For example, SETCC(FREEZE(X), -128, SETULT) cannot be folded to
11155 // FREEZE(SETCC(X, -128, SETULT)) because X can be poison.
11157 // This transformation is beneficial because visitBRCOND can fold
11158 // BRCOND(FREEZE(X)) to BRCOND(X).
11160 // Conservatively optimize integer comparisons only.
11162 // Do this only when SETCC is going to be used by BRCOND.
11164 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
11165 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
11166 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11167 bool Updated = false;
11169 // Is 'X Cond C' always true or false?
11170 auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) {
11171 bool False = (Cond == ISD::SETULT && C->isZero()) ||
11172 (Cond == ISD::SETLT && C->isMinSignedValue()) ||
11173 (Cond == ISD::SETUGT && C->isAllOnes()) ||
11174 (Cond == ISD::SETGT && C->isMaxSignedValue());
11175 bool True = (Cond == ISD::SETULE && C->isAllOnes()) ||
11176 (Cond == ISD::SETLE && C->isMaxSignedValue()) ||
11177 (Cond == ISD::SETUGE && C->isZero()) ||
11178 (Cond == ISD::SETGE && C->isMinSignedValue());
11179 return True || False;
11180 };
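// e.g. (X u< 0) is always false after freezing X, while freeze (setcc X, 0,
// ult) may yield either value when X is poison, so such cases are skipped.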
11182 if (N0->getOpcode() == ISD::FREEZE && N0.hasOneUse() && N1C) {
11183 if (!IsAlwaysTrueOrFalse(Cond, N1C)) {
11184 N0 = N0->getOperand(0);
11185 Updated = true;
11186 }
11187 }
11188 if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse() && N0C) {
11189 if (!IsAlwaysTrueOrFalse(ISD::getSetCCSwappedOperands(Cond),
11190 N0C)) {
11191 N1 = N1->getOperand(0);
11192 Updated = true;
11193 }
11194 }
11196 if (Updated)
11197 return DAG.getFreeze(DAG.getSetCC(SDLoc(N), VT, N0, N1, Cond));
11198 }
11200 SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond,
11201 SDLoc(N), !PreferSetCC);
11203 if (!Combined)
11204 return SDValue();
11206 // If we prefer to have a setcc, and we don't, we'll try our best to
11207 // recreate one using rebuildSetCC.
11208 if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
11209 SDValue NewSetCC = rebuildSetCC(Combined);
11211 // We don't have anything interesting to combine to.
11212 if (NewSetCC.getNode() == N)
11213 return SDValue();
11215 if (NewSetCC)
11216 return NewSetCC;
11217 }
11219 return Combined;
11220 }
11222 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
11223 SDValue LHS = N->getOperand(0);
11224 SDValue RHS = N->getOperand(1);
11225 SDValue Carry = N->getOperand(2);
11226 SDValue Cond = N->getOperand(3);
11228 // If Carry is false, fold to a regular SETCC.
11229 if (isNullConstant(Carry))
11230 return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
11232 return SDValue();
11233 }
11235 /// Check if N satisfies:
11236 /// N is used once.
11237 /// N is a load.
11238 /// The load is compatible with ExtOpcode. It means
11239 /// If load has explicit zero/sign extension, ExtOpcode must have the same
11240 /// extension.
11241 /// Otherwise returns true.
11242 static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) {
11243 if (!N.hasOneUse())
11244 return false;
11246 if (!isa<LoadSDNode>(N))
11247 return false;
11249 LoadSDNode *Load = cast<LoadSDNode>(N);
11250 ISD::LoadExtType LoadExt = Load->getExtensionType();
11251 if (LoadExt == ISD::NON_EXTLOAD || LoadExt == ISD::EXTLOAD)
11252 return true;
11254 // Now LoadExt is either SEXTLOAD or ZEXTLOAD, ExtOpcode must have the same
11256 if ((LoadExt == ISD::SEXTLOAD && ExtOpcode != ISD::SIGN_EXTEND) ||
11257 (LoadExt == ISD::ZEXTLOAD && ExtOpcode != ISD::ZERO_EXTEND))
11258 return false;
11260 return true;
11261 }
11263 /// Fold
11264 /// (sext (select c, load x, load y)) -> (select c, sextload x, sextload y)
11265 /// (zext (select c, load x, load y)) -> (select c, zextload x, zextload y)
11266 /// (aext (select c, load x, load y)) -> (select c, extload x, extload y)
11267 /// This function is called by the DAGCombiner when visiting sext/zext/aext
11268 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
11269 static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
11270 SelectionDAG &DAG) {
11271 unsigned Opcode = N->getOpcode();
11272 SDValue N0 = N->getOperand(0);
11273 EVT VT = N->getValueType(0);
11274 SDLoc DL(N);
11276 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
11277 Opcode == ISD::ANY_EXTEND) &&
11278 "Expected EXTEND dag node in input!");
11280 if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
11281 !N0.hasOneUse())
11282 return SDValue();
11284 SDValue Op1 = N0->getOperand(1);
11285 SDValue Op2 = N0->getOperand(2);
11286 if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
11287 return SDValue();
11289 auto ExtLoadOpcode = ISD::EXTLOAD;
11290 if (Opcode == ISD::SIGN_EXTEND)
11291 ExtLoadOpcode = ISD::SEXTLOAD;
11292 else if (Opcode == ISD::ZERO_EXTEND)
11293 ExtLoadOpcode = ISD::ZEXTLOAD;
11295 LoadSDNode *Load1 = cast<LoadSDNode>(Op1);
11296 LoadSDNode *Load2 = cast<LoadSDNode>(Op2);
11297 if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) ||
11298 !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()))
11299 return SDValue();
11301 SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
11302 SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
11303 return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
11304 }
11306 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
11307 /// a build_vector of constants.
11308 /// This function is called by the DAGCombiner when visiting sext/zext/aext
11309 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
11310 /// Vector extends are not folded if operations are legal; this is to
11311 /// avoid introducing illegal build_vector dag nodes.
11312 static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
11313 SelectionDAG &DAG, bool LegalTypes) {
11314 unsigned Opcode = N->getOpcode();
11315 SDValue N0 = N->getOperand(0);
11316 EVT VT = N->getValueType(0);
11319 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
11320 Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
11321 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
11322 && "Expected EXTEND dag node in input!");
11324 // fold (sext c1) -> c1
11325 // fold (zext c1) -> c1
11326 // fold (aext c1) -> c1
11327 if (isa<ConstantSDNode>(N0))
11328 return DAG.getNode(Opcode, DL, VT, N0);
11330 // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
11331 // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
11332 // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
11333 if (N0->getOpcode() == ISD::SELECT) {
11334 SDValue Op1 = N0->getOperand(1);
11335 SDValue Op2 = N0->getOperand(2);
11336 if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
11337 (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
11338 // For any_extend, choose sign extension of the constants to allow a
11339 // possible further transform to sign_extend_inreg, i.e.:
11341 // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
11342 // t2: i64 = any_extend t1
11343 // -->
11344 // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
11345 // -->
11346 // t4: i64 = sign_extend_inreg t3
11347 unsigned FoldOpc = Opcode;
11348 if (FoldOpc == ISD::ANY_EXTEND)
11349 FoldOpc = ISD::SIGN_EXTEND;
11350 return DAG.getSelect(DL, VT, N0->getOperand(0),
11351 DAG.getNode(FoldOpc, DL, VT, Op1),
11352 DAG.getNode(FoldOpc, DL, VT, Op2));
11356 // fold (sext (build_vector AllConstants) -> (build_vector AllConstants)
11357 // fold (zext (build_vector AllConstants) -> (build_vector AllConstants)
11358 // fold (aext (build_vector AllConstants) -> (build_vector AllConstants)
11359 EVT SVT = VT.getScalarType();
11360 if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
11361 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
11362 return SDValue();
11364 // We can fold this node into a build_vector.
11365 unsigned VTBits = SVT.getSizeInBits();
11366 unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
11367 SmallVector<SDValue, 8> Elts;
11368 unsigned NumElts = VT.getVectorNumElements();
11370 // For zero-extensions, UNDEF elements are still guaranteed to have the
11371 // upper bits set to zero.
11372 bool IsZext =
11373 Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG;
11375 for (unsigned i = 0; i != NumElts; ++i) {
11376 SDValue Op = N0.getOperand(i);
11377 if (Op.isUndef()) {
11378 Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT));
11379 continue;
11380 }
11383 // Get the constant value and if needed trunc it to the size of the type.
11384 // Nodes like build_vector might have constants wider than the scalar type.
11385 APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
11386 if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
11387 Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
11389 Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
11390 }
11392 return DAG.getBuildVector(VT, DL, Elts);
11393 }
11395 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
11396 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
11397 // transformation. Returns true if extensions are possible and the
11398 // above-mentioned transformation is profitable.
11399 static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
11400 unsigned ExtOpc,
11401 SmallVectorImpl<SDNode *> &ExtendNodes,
11402 const TargetLowering &TLI) {
11403 bool HasCopyToRegUses = false;
11404 bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
11405 for (SDNode::use_iterator UI = N0->use_begin(), UE = N0->use_end(); UI != UE;
11406 ++UI) {
11407 SDNode *User = *UI;
11408 if (User == N)
11409 continue;
11410 if (UI.getUse().getResNo() != N0.getResNo())
11411 continue;
11412 // FIXME: Only extend SETCC N, N and SETCC N, c for now.
11413 if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
11414 ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
11415 if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
11416 // Sign bits will be lost after a zext.
11417 return false;
11419 for (unsigned i = 0; i != 2; ++i) {
11420 SDValue UseOp = User->getOperand(i);
11421 if (UseOp == N0)
11422 continue;
11423 if (!isa<ConstantSDNode>(UseOp))
11424 return false;
11425 }
11428 ExtendNodes.push_back(User);
11429 continue;
11430 }
11431 // If truncates aren't free and there are users we can't
11432 // extend, it isn't worthwhile.
11433 if (!isTruncFree)
11434 return false;
11435 // Remember if this value is live-out.
11436 if (User->getOpcode() == ISD::CopyToReg)
11437 HasCopyToRegUses = true;
11440 if (HasCopyToRegUses) {
11441 bool BothLiveOut = false;
11442 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
11443 UI != UE; ++UI) {
11444 SDUse &Use = UI.getUse();
11445 if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
11446 BothLiveOut = true;
11447 break;
11448 }
11449 }
11450 if (BothLiveOut)
11451 // Both unextended and extended values are live out. There had better be
11452 // a good reason for the transformation.
11453 return ExtendNodes.size();
11454 }
11455 return true;
11456 }
11458 void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
11459 SDValue OrigLoad, SDValue ExtLoad,
11460 ISD::NodeType ExtType) {
11461 // Extend SetCC uses if necessary.
11462 SDLoc DL(ExtLoad);
11463 for (SDNode *SetCC : SetCCs) {
11464 SmallVector<SDValue, 4> Ops;
11466 for (unsigned j = 0; j != 2; ++j) {
11467 SDValue SOp = SetCC->getOperand(j);
11468 if (SOp == OrigLoad)
11469 Ops.push_back(ExtLoad);
11470 else
11471 Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
11472 }
11474 Ops.push_back(SetCC->getOperand(2));
11475 CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
11476 }
11477 }
11479 // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
11480 SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
11481 SDValue N0 = N->getOperand(0);
11482 EVT DstVT = N->getValueType(0);
11483 EVT SrcVT = N0.getValueType();
11485 assert((N->getOpcode() == ISD::SIGN_EXTEND ||
11486 N->getOpcode() == ISD::ZERO_EXTEND) &&
11487 "Unexpected node type (not an extend)!");
11489 // fold (sext (load x)) to multiple smaller sextloads; same for zext.
11490 // For example, on a target with legal v4i32, but illegal v8i32, turn:
11491 // (v8i32 (sext (v8i16 (load x))))
11493 // (v8i32 (concat_vectors (v4i32 (sextload x)),
11494 // (v4i32 (sextload (x + 16)))))
11495 // Where uses of the original load, i.e.:
11496 // (v8i16 (load x))
11497 // are replaced with:
11498 // (v8i16 (truncate
11499 // (v8i32 (concat_vectors (v4i32 (sextload x)),
11500 // (v4i32 (sextload (x + 16)))))))
11502 // This combine is only applicable to illegal, but splittable, vectors.
11503 // All legal types, and illegal non-vector types, are handled elsewhere.
11504 // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
11506 if (N0->getOpcode() != ISD::LOAD)
11507 return SDValue();
11509 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
11511 if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
11512 !N0.hasOneUse() || !LN0->isSimple() ||
11513 !DstVT.isVector() || !DstVT.isPow2VectorType() ||
11514 !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
11515 return SDValue();
11517 SmallVector<SDNode *, 4> SetCCs;
11518 if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
11519 return SDValue();
11521 ISD::LoadExtType ExtType =
11522 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
11524 // Try to split the vector types to get down to legal types.
11525 EVT SplitSrcVT = SrcVT;
11526 EVT SplitDstVT = DstVT;
  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
         SplitSrcVT.getVectorNumElements() > 1) {
    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
  }
  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
    return SDValue();

  assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");

  SDLoc DL(N);
11539 const unsigned NumSplits =
11540 DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
11541 const unsigned Stride = SplitSrcVT.getStoreSize();
11542 SmallVector<SDValue, 4> Loads;
11543 SmallVector<SDValue, 4> Chains;
11545 SDValue BasePtr = LN0->getBasePtr();
11546 for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
11547 const unsigned Offset = Idx * Stride;
11548 const Align Align = commonAlignment(LN0->getAlign(), Offset);
11550 SDValue SplitLoad = DAG.getExtLoad(
11551 ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
11552 LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
11553 LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
11555 BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(Stride), DL);
    Loads.push_back(SplitLoad.getValue(0));
    Chains.push_back(SplitLoad.getValue(1));
  }

  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);

  // Simplify the TokenFactor.
  AddToWorklist(NewChain.getNode());
11567 CombineTo(N, NewValue);
  // Replace uses of the original load (before extension)
  // with a truncate of the concatenated sextloaded vectors.
  SDValue Trunc =
      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
  ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
  CombineTo(N0.getNode(), Trunc, NewChain);
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
11578 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
11579 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
11580 SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
11581 assert(N->getOpcode() == ISD::ZERO_EXTEND);
11582 EVT VT = N->getValueType(0);
11583 EVT OrigVT = N->getOperand(0).getValueType();
  if (TLI.isZExtFree(OrigVT, VT))
    return SDValue();

  // and/or/xor
  SDValue N0 = N->getOperand(0);
  if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) ||
      N0.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
    return SDValue();

  // shl/shr
  SDValue N1 = N0->getOperand(0);
  if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
      N1.getOperand(1).getOpcode() != ISD::Constant ||
      (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
    return SDValue();

  // load
  if (!isa<LoadSDNode>(N1.getOperand(0)))
    return SDValue();
11605 LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
11606 EVT MemVT = Load->getMemoryVT();
  if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
      Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
    return SDValue();

  // If the shift op is SHL, the logic op must be AND, otherwise the result
  // will be wrong.
  if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
    return SDValue();

  if (!N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SmallVector<SDNode *, 4> SetCCs;
  if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
                               ISD::ZERO_EXTEND, SetCCs, TLI))
    return SDValue();
11625 // Actually do the transformation.
11626 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
11627 Load->getChain(), Load->getBasePtr(),
11628 Load->getMemoryVT(), Load->getMemOperand());
  SDLoc DL1(N1);
  SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
                              N1.getOperand(1));

  APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
  SDLoc DL0(N0);
  SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
                            DAG.getConstant(Mask, DL0, VT));

  ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
  CombineTo(N, And);
  if (SDValue(Load, 0).hasOneUse()) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
  } else {
    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
                                Load->getValueType(0), ExtLoad);
    CombineTo(Load, Trunc, ExtLoad.getValue(1));
  }
11649 // N0 is dead at this point.
11650 recursivelyDeleteUnusedNodes(N0.getNode());
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
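// Illustrative sketch of the fold above (hand-written assumption, not from a
// test case):
//   (i64 (zext (i32 (and (i32 (srl (i32 (load p)), 8)), 0xff))))
// becomes
//   (i64 (and (i64 (srl (i64 (zextload p)), 8)), 0xff))
// i.e. the zext is folded into the load, and the shift/mask constants are
// re-emitted at the wider type, where they select the same byte.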
11655 /// If we're narrowing or widening the result of a vector select and the final
11656 /// size is the same size as a setcc (compare) feeding the select, then try to
11657 /// apply the cast operation to the select's operands because matching vector
11658 /// sizes for a select condition and other operands should be more efficient.
11659 SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
11660 unsigned CastOpcode = Cast->getOpcode();
11661 assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
11662 CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
11663 CastOpcode == ISD::FP_ROUND) &&
11664 "Unexpected opcode for vector select narrowing/widening");
11666 // We only do this transform before legal ops because the pattern may be
11667 // obfuscated by target-specific operations after legalization. Do not create
11668 // an illegal select op, however, because that may be difficult to lower.
  EVT VT = Cast->getValueType(0);
  if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();

  SDValue VSel = Cast->getOperand(0);
  if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
      VSel.getOperand(0).getOpcode() != ISD::SETCC)
    return SDValue();

  // Does the setcc have the same vector size as the casted select?
  SDValue SetCC = VSel.getOperand(0);
  EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
  if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
    return SDValue();
11684 // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
11685 SDValue A = VSel.getOperand(1);
11686 SDValue B = VSel.getOperand(2);
  SDValue CastA, CastB;
  SDLoc DL(Cast);
  if (CastOpcode == ISD::FP_ROUND) {
    // FP_ROUND (fptrunc) has an extra flag operand to pass along.
    CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
    CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
  } else {
    CastA = DAG.getNode(CastOpcode, DL, VT, A);
    CastB = DAG.getNode(CastOpcode, DL, VT, B);
  }
  return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
}
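// Illustrative sketch (assumed pre-legalization types, not from a test):
//   (v4i32 (zero_extend (vselect (setcc v4i32 X, Y), v4i16 A, v4i16 B)))
// becomes
//   (vselect (setcc v4i32 X, Y),
//            (v4i32 (zero_extend A)), (v4i32 (zero_extend B)))
// so the select condition and the selected operands end up with matching
// vector sizes.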
11700 // fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11701 // fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11702 static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
11703 const TargetLowering &TLI, EVT VT,
11704 bool LegalOperations, SDNode *N,
11705 SDValue N0, ISD::LoadExtType ExtLoadType) {
11706 SDNode *N0Node = N0.getNode();
11707 bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
11708 : ISD::isZEXTLoad(N0Node);
  if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
      !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
    return SDValue();

  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
  EVT MemVT = LN0->getMemoryVT();
  if ((LegalOperations || !LN0->isSimple() || VT.isVector()) &&
      !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
    return SDValue();

  SDValue ExtLoad =
      DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
                     LN0->getBasePtr(), MemVT, LN0->getMemOperand());
11723 Combiner.CombineTo(N, ExtLoad);
11724 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
11725 if (LN0->use_empty())
11726 Combiner.recursivelyDeleteUnusedNodes(LN0);
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
11730 // fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11731 // Only generate vector extloads when 1) they're legal, and 2) they are
11732 // deemed desirable by the target.
11733 static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
11734 const TargetLowering &TLI, EVT VT,
11735 bool LegalOperations, SDNode *N, SDValue N0,
11736 ISD::LoadExtType ExtLoadType,
11737 ISD::NodeType ExtOpc) {
  // TODO: isFixedLengthVector() should be removed; any negative effects on
  // code generation should then be handled by the target's implementation of
  // isVectorLoadExtDesirable().
  if (!ISD::isNON_EXTLoad(N0.getNode()) ||
      !ISD::isUNINDEXEDLoad(N0.getNode()) ||
      ((LegalOperations || VT.isFixedLengthVector() ||
        !cast<LoadSDNode>(N0)->isSimple()) &&
       !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
    return SDValue();

  bool DoXform = true;
  SmallVector<SDNode *, 4> SetCCs;
  if (!N0.hasOneUse())
    DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
  if (VT.isVector())
    DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
  if (!DoXform)
    return SDValue();
11757 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
11758 SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
11759 LN0->getBasePtr(), N0.getValueType(),
11760 LN0->getMemOperand());
11761 Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
11762 // If the load value is used only by N, replace it via CombineTo N.
11763 bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
11764 Combiner.CombineTo(N, ExtLoad);
  if (NoReplaceTrunc) {
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
    Combiner.recursivelyDeleteUnusedNodes(LN0);
  } else {
    SDValue Trunc =
        DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
    Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
  }
  return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
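// Illustrative sketch (assumed i8/i32 types, hand-written): if
// t1 = (i8 load p) has a second use in (setcc t1, 0, seteq), the fold above
// produces t1' = (i32 sextload p) and rewrites the uses as
//   (i32 sign_extend t1) -> t1'
//   (setcc t1, 0, seteq) -> (setcc t1', 0, seteq)   [via ExtendSetCCUses]
//   t1                   -> (i8 truncate t1')       [only if t1 has other uses]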
11776 static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
11777 const TargetLowering &TLI, EVT VT,
11778 SDNode *N, SDValue N0,
11779 ISD::LoadExtType ExtLoadType,
11780 ISD::NodeType ExtOpc) {
  if (!N0.hasOneUse())
    return SDValue();

  MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
  if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
    return SDValue();

  if (!TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0)))
    return SDValue();

  if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
    return SDValue();

  SDLoc dl(Ld);
  SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
11796 SDValue NewLoad = DAG.getMaskedLoad(
11797 VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
11798 PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
11799 ExtLoadType, Ld->isExpandingLoad());
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
  return NewLoad;
}
11804 static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
11805 bool LegalOperations) {
11806 assert((N->getOpcode() == ISD::SIGN_EXTEND ||
11807 N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
11809 SDValue SetCC = N->getOperand(0);
  if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
      !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
    return SDValue();
11814 SDValue X = SetCC.getOperand(0);
11815 SDValue Ones = SetCC.getOperand(1);
11816 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
11817 EVT VT = N->getValueType(0);
11818 EVT XVT = X.getValueType();
11819 // setge X, C is canonicalized to setgt, so we do not need to match that
11820 // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
11821 // not require the 'not' op.
11822 if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
11823 // Invert and smear/shift the sign bit:
11824 // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
11825 // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
    SDLoc DL(N);
    unsigned ShCt = VT.getSizeInBits() - 1;
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
      SDValue NotX = DAG.getNOT(DL, X, VT);
      SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
      auto ShiftOpcode =
          N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
      return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
    }
  }
  return SDValue();
}
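// Worked check of the sign-bit smear above (i32, hand-computed): for X = 5,
//   sext i1 (setgt i32 X, -1) = sext(true) = -1
//   sra (not X), 31           = sra 0xFFFFFFFA, 31 = -1
// and for X = -5 both forms yield 0, so the shift form is equivalent.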
11840 SDValue DAGCombiner::foldSextSetcc(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  if (N0.getOpcode() != ISD::SETCC)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  SDValue N01 = N0.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
  EVT VT = N->getValueType(0);
  EVT N00VT = N00.getValueType();
  SDLoc DL(N);

  // Propagate fast-math-flags.
  SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
11855 // On some architectures (such as SSE/NEON/etc) the SETCC result type is
11856 // the same size as the compared operands. Try to optimize sext(setcc())
11857 // if this is the case.
11858 if (VT.isVector() && !LegalOperations &&
11859 TLI.getBooleanContents(N00VT) ==
11860 TargetLowering::ZeroOrNegativeOneBooleanContent) {
11861 EVT SVT = getSetCCResultType(N00VT);
11863 // If we already have the desired type, don't change it.
11864 if (SVT != N0.getValueType()) {
11865 // We know that the # elements of the results is the same as the
11866 // # elements of the compare (and the # elements of the compare result
11867 // for that matter). Check to see that they are the same size. If so,
11868 // we know that the element size of the sext'd result matches the
11869 // element size of the compare operands.
11870 if (VT.getSizeInBits() == SVT.getSizeInBits())
11871 return DAG.getSetCC(DL, VT, N00, N01, CC);
11873 // If the desired elements are smaller or larger than the source
11874 // elements, we can use a matching integer vector type and then
11875 // truncate/sign extend.
      EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
      if (SVT == MatchingVecType) {
        SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
        return DAG.getSExtOrTrunc(VsetCC, DL, VT);
      }
    }

    // Try to eliminate the sext of a setcc by zexting the compare operands.
    if (N0.hasOneUse() && TLI.isOperationLegalOrCustom(ISD::SETCC, VT) &&
        !TLI.isOperationLegalOrCustom(ISD::SETCC, SVT)) {
11886 bool IsSignedCmp = ISD::isSignedIntSetCC(CC);
11887 unsigned LoadOpcode = IsSignedCmp ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
11888 unsigned ExtOpcode = IsSignedCmp ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
11890 // We have an unsupported narrow vector compare op that would be legal
11891 // if extended to the destination type. See if the compare operands
11892 // can be freely extended to the destination type.
      auto IsFreeToExtend = [&](SDValue V) {
        if (isConstantOrConstantVector(V, /*NoOpaques*/ true))
          return true;
        // Match a simple, non-extended load that can be converted to a
        // legal {z/s}ext-load.
        // TODO: Allow widening of an existing {z/s}ext-load?
        if (!(ISD::isNON_EXTLoad(V.getNode()) &&
              ISD::isUNINDEXEDLoad(V.getNode()) &&
              cast<LoadSDNode>(V)->isSimple() &&
              TLI.isLoadExtLegal(LoadOpcode, VT, V.getValueType())))
          return false;

        // Non-chain users of this value must either be the setcc in this
        // sequence or extends that can be folded into the new {z/s}ext-load.
        for (SDNode::use_iterator UI = V->use_begin(), UE = V->use_end();
             UI != UE; ++UI) {
          // Skip uses of the chain and the setcc.
          SDNode *User = *UI;
          if (UI.getUse().getResNo() != 0 || User == N0.getNode())
            continue;
          // Extra users must have exactly the same cast we are about to create.
          // TODO: This restriction could be eased if ExtendUsesToFormExtLoad()
          //       is enhanced similarly.
          if (User->getOpcode() != ExtOpcode || User->getValueType(0) != VT)
            return false;
        }
        return true;
      };

      if (IsFreeToExtend(N00) && IsFreeToExtend(N01)) {
        SDValue Ext0 = DAG.getNode(ExtOpcode, DL, VT, N00);
        SDValue Ext1 = DAG.getNode(ExtOpcode, DL, VT, N01);
        return DAG.getSetCC(DL, VT, Ext0, Ext1, CC);
      }
    }
  }
11930 // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
11931 // Here, T can be 1 or -1, depending on the type of the setcc and
11932 // getBooleanContents().
11933 unsigned SetCCWidth = N0.getScalarValueSizeInBits();
11935 // To determine the "true" side of the select, we need to know the high bit
11936 // of the value returned by the setcc if it evaluates to true.
11937 // If the type of the setcc is i1, then the true case of the select is just
11938 // sext(i1 1), that is, -1.
11939 // If the type of the setcc is larger (say, i8) then the value of the high
11940 // bit depends on getBooleanContents(), so ask TLI for a real "true" value
11941 // of the appropriate width.
11942 SDValue ExtTrueVal = (SetCCWidth == 1)
11943 ? DAG.getAllOnesConstant(DL, VT)
11944 : DAG.getBoolConstant(true, DL, VT, N00VT);
11945 SDValue Zero = DAG.getConstant(0, DL, VT);
  if (SDValue SCC = SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
    return SCC;
11949 if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
11950 EVT SetCCVT = getSetCCResultType(N00VT);
11951 // Don't do this transform for i1 because there's a select transform
11952 // that would reverse it.
11953 // TODO: We should not do this transform at all without a target hook
11954 // because a sext is likely cheaper than a select?
    if (SetCCVT.getScalarSizeInBits() != 1 &&
        (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
      SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
      return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
    }
  }

  return SDValue();
}
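// Illustrative sketch of the select fallback above (scalar i32 result,
// assumed boolean contents): with an i1 setcc,
//   (i32 sext (i1 setcc X, Y, setlt))
// becomes
//   (i32 select (setcc X, Y, setlt), -1, 0)
// i.e. an all-ones "true" value of the destination width is materialized.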
11965 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  // sext(undef) = 0 because the top bits will all be the same.
  if (N0.isUndef())
    return DAG.getConstant(0, DL, VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;
11977 // fold (sext (sext x)) -> (sext x)
11978 // fold (sext (aext x)) -> (sext x)
11979 if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
11980 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
11982 if (N0.getOpcode() == ISD::TRUNCATE) {
11983 // fold (sext (truncate (load x))) -> (sext (smaller load x))
11984 // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
11985 if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
11986 SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
11995 // See if the value being truncated is already sign extended. If so, just
11996 // eliminate the trunc/sext pair.
11997 SDValue Op = N0.getOperand(0);
11998 unsigned OpBits = Op.getScalarValueSizeInBits();
11999 unsigned MidBits = N0.getScalarValueSizeInBits();
12000 unsigned DestBits = VT.getScalarSizeInBits();
12001 unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
    if (OpBits == DestBits) {
      // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
      // bits, it is already the value we need.
      if (NumSignBits > DestBits - MidBits)
        return Op;
    } else if (OpBits < DestBits) {
      // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
      // bits, just sext from i32.
      if (NumSignBits > OpBits - MidBits)
        return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
    } else {
      // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
      // bits, just truncate to i32.
      if (NumSignBits > OpBits - MidBits)
        return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
    }
12020 // fold (sext (truncate x)) -> (sextinreg x).
12021 if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
12022 N0.getValueType())) {
12023 if (OpBits < DestBits)
12024 Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
12025 else if (OpBits > DestBits)
12026 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                         DAG.getValueType(N0.getValueType()));
    }
  }
12032 // Try to simplify (sext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::SEXTLOAD, ISD::SIGN_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
                                   ISD::SIGN_EXTEND))
    return foldedExt;
12043 // fold (sext (load x)) to multiple smaller sextloads.
12044 // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;

  // Try to simplify (sext (sextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
    return foldedExt;
12053 // fold (sext (and/or/xor (load x), cst)) ->
12054 // (and/or/xor (sextload x), (sext cst))
12055 if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
12056 N0.getOpcode() == ISD::XOR) &&
12057 isa<LoadSDNode>(N0.getOperand(0)) &&
12058 N0.getOperand(1).getOpcode() == ISD::Constant &&
12059 (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
12060 LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
12061 EVT MemVT = LN00->getMemoryVT();
12062 if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
12063 LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
12064 SmallVector<SDNode*, 4> SetCCs;
      bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                             ISD::SIGN_EXTEND, SetCCs, TLI);
      if (DoXform) {
        SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
12069 LN00->getChain(), LN00->getBasePtr(),
12070 LN00->getMemoryVT(),
12071 LN00->getMemOperand());
12072 APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
12073 SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
12074 ExtLoad, DAG.getConstant(Mask, DL, VT));
12075 ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
12076 bool NoReplaceTruncAnd = !N0.hasOneUse();
12077 bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
    }
  }
  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;

  if (SDValue V = foldSextSetcc(N))
    return V;
12103 // fold (sext x) -> (zext x) if the sign bit is known zero.
12104 if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
12105 DAG.SignBitIsZero(N0))
12106 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;
12111 // Eliminate this sign extend by doing a negation in the destination type:
12112 // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
12113 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
12114 isNullOrNullSplat(N0.getOperand(0)) &&
12115 N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
12116 TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
12117 SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
  }
12120 // Eliminate this sign extend by doing a decrement in the destination type:
12121 // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
12122 if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
12123 isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
12124 N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
12125 TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
12126 SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
    return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
  }
12130 // fold sext (not i1 X) -> add (zext i1 X), -1
12131 // TODO: This could be extended to handle bool vectors.
12132 if (N0.getValueType() == MVT::i1 && isBitwiseNot(N0) && N0.hasOneUse() &&
12133 (!LegalOperations || (TLI.isOperationLegal(ISD::ZERO_EXTEND, VT) &&
12134 TLI.isOperationLegal(ISD::ADD, VT)))) {
    // If we can eliminate the 'not', the sext form should be better.
    if (SDValue NewXor = visitXOR(N0.getNode())) {
      // Returning N0 is a form of in-visit replacement that may have
      // invalidated N0.
      if (NewXor.getNode() == N0.getNode()) {
        // Return SDValue here as the xor should have already been replaced in
        // this sext.
        return SDValue();
      }

      // Return a new sext with the new xor.
      return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor);
    }

    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
  }

  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
    return Res;

  return SDValue();
}
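// Worked check of the sext (not i1 X) -> add (zext i1 X), -1 fold above
// (hand-computed): X = 0 gives not X = 1, sext = -1, and zext(0) + (-1) = -1;
// X = 1 gives not X = 0, sext = 0, and zext(1) + (-1) = 0. The add form
// matches for both values of X.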
12159 // isTruncateOf - If N is a truncate of some other value, return true, record
12160 // the value being truncated in Op and which of Op's bits are zero/one in Known.
12161 // This function computes KnownBits to avoid a duplicated call to
12162 // computeKnownBits in the caller.
12163 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
12164 KnownBits &Known) {
  if (N->getOpcode() == ISD::TRUNCATE) {
    Op = N->getOperand(0);
    Known = DAG.computeKnownBits(Op);
    return true;
  }

  if (N.getOpcode() != ISD::SETCC ||
      N.getValueType().getScalarType() != MVT::i1 ||
      cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
    return false;

  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  assert(Op0.getValueType() == Op1.getValueType());

  if (isNullOrNullSplat(Op0))
    Op = Op1;
  else if (isNullOrNullSplat(Op1))
    Op = Op0;
  else
    return false;

  Known = DAG.computeKnownBits(Op);

  // The i1 setcc result is a "truncate" of Op only if every bit of Op other
  // than bit 0 is known zero.
  return (Known.Zero | 1).isAllOnes();
}
12192 /// Given an extending node with a pop-count operand, if the target does not
12193 /// support a pop-count in the narrow source type but does support it in the
12194 /// destination type, widen the pop-count to the destination type.
12195 static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
12196 assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
12197 Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
  SDValue CtPop = Extend->getOperand(0);
  if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
    return SDValue();

  EVT VT = Extend->getValueType(0);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
      !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
    return SDValue();

  // zext (ctpop X) --> ctpop (zext X)
  SDLoc DL(Extend);
  SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
  return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
}
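// Illustrative sketch (assumed types): on a target with a legal i32 ctpop but
// no i16 ctpop,
//   (i32 zero_extend (i16 ctpop X))
// becomes
//   (i32 ctpop (i32 zero_extend X))
// which is safe because zero-extending X adds only zero bits and so does not
// change its population count.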
12215 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
12216 SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // zext(undef) = 0
  if (N0.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;
12226 // fold (zext (zext x)) -> (zext x)
12227 // fold (zext (aext x)) -> (zext x)
12228 if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
12232 // fold (zext (truncate x)) -> (zext x) or
12233 // (zext (truncate x)) -> (truncate x)
  // This is valid when the truncated bits of x are already zero.
  SDValue Op;
  KnownBits Known;
  if (isTruncateOf(DAG, N0, Op, Known)) {
12238 APInt TruncatedBits =
12239 (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
12240 APInt(Op.getScalarValueSizeInBits(), 0) :
12241 APInt::getBitsSet(Op.getScalarValueSizeInBits(),
12242 N0.getScalarValueSizeInBits(),
12243 std::min(Op.getScalarValueSizeInBits(),
12244 VT.getScalarSizeInBits()));
12245 if (TruncatedBits.isSubsetOf(Known.Zero))
      return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
  }
12249 // fold (zext (truncate x)) -> (and x, mask)
12250 if (N0.getOpcode() == ISD::TRUNCATE) {
12251 // fold (zext (truncate (load x))) -> (zext (smaller load x))
12252 // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
12253 if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
12254 SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
12263 EVT SrcVT = N0.getOperand(0).getValueType();
12264 EVT MinVT = N0.getValueType();
12266 // Try to mask before the extension to avoid having to generate a larger mask,
12267 // possibly over several sub-vectors.
12268 if (SrcVT.bitsLT(VT) && VT.isVector()) {
12269 if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
12270 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
12271 SDValue Op = N0.getOperand(0);
12272 Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
12273 AddToWorklist(Op.getNode());
12274 SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
12275 // Transfer the debug info; the new node is equivalent to N0.
12276 DAG.transferDbgValues(N0, ZExtOrTrunc);
        return ZExtOrTrunc;
      }
    }
12281 if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
12282 SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
12283 AddToWorklist(Op.getNode());
12284 SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
12285 // We may safely transfer the debug info describing the truncate node over
12286 // to the equivalent and operation.
      DAG.transferDbgValues(N0, And);
      return And;
    }
  }
12292 // Fold (zext (and (trunc x), cst)) -> (and x, cst),
12293 // if either of the casts is not free.
12294 if (N0.getOpcode() == ISD::AND &&
12295 N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
12296 N0.getOperand(1).getOpcode() == ISD::Constant &&
12297 (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
12298 N0.getValueType()) ||
12299 !TLI.isZExtFree(N0.getValueType(), VT))) {
12300 SDValue X = N0.getOperand(0).getOperand(0);
12301 X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
    APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
    SDLoc DL(N);
    return DAG.getNode(ISD::AND, DL, VT, X, DAG.getConstant(Mask, DL, VT));
  }
12308 // Try to simplify (zext (load x)).
  if (SDValue foldedExt =
          tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                             ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
    return foldedExt;

  if (SDValue foldedExt =
          tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
                                   ISD::ZERO_EXTEND))
    return foldedExt;
12319 // fold (zext (load x)) to multiple smaller zextloads.
12320 // Only on illegal but splittable vectors.
  if (SDValue ExtLoad = CombineExtLoad(N))
    return ExtLoad;
12324 // fold (zext (and/or/xor (load x), cst)) ->
12325 // (and/or/xor (zextload x), (zext cst))
12326 // Unless (and (load x) cst) will match as a zextload already and has
12327 // additional users.
12328 if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
12329 N0.getOpcode() == ISD::XOR) &&
12330 isa<LoadSDNode>(N0.getOperand(0)) &&
12331 N0.getOperand(1).getOpcode() == ISD::Constant &&
12332 (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
12333 LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
12334 EVT MemVT = LN00->getMemoryVT();
12335 if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
12336 LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
12337 bool DoXform = true;
12338 SmallVector<SDNode*, 4> SetCCs;
12339 if (!N0.hasOneUse()) {
        if (N0.getOpcode() == ISD::AND) {
          auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
          EVT LoadResultTy = AndC->getValueType(0);
          EVT ExtVT;
          if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
            DoXform = false;
        }
      }
      if (DoXform)
        DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
                                          ISD::ZERO_EXTEND, SetCCs, TLI);
      if (DoXform) {
12352 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
12353 LN00->getChain(), LN00->getBasePtr(),
12354 LN00->getMemoryVT(),
12355 LN00->getMemOperand());
        APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
        SDLoc DL(N);
        SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                  ExtLoad, DAG.getConstant(Mask, DL, VT));
12360 ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
12361 bool NoReplaceTruncAnd = !N0.hasOneUse();
12362 bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
        // If N0 has multiple uses, change other uses as well.
        if (NoReplaceTruncAnd) {
          SDValue TruncAnd =
              DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
          CombineTo(N0.getNode(), TruncAnd);
        }
        if (NoReplaceTrunc) {
          DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
        } else {
          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
                                      LN00->getValueType(0), ExtLoad);
          CombineTo(LN00, Trunc, ExtLoad.getValue(1));
        }
        return SDValue(N, 0); // Return N so it doesn't get rechecked!
      }
    }
  }
12382 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
12383 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
  if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
    return ZExtLoad;
12387 // Try to simplify (zext (zextload x)).
  if (SDValue foldedExt = tryToFoldExtOfExtload(
          DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
    return foldedExt;

  if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
    return V;
12395 if (N0.getOpcode() == ISD::SETCC) {
12396 // Propagate fast-math-flags.
12397 SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
12399 // Only do this before legalize for now.
12400 if (!LegalOperations && VT.isVector() &&
12401 N0.getValueType().getVectorElementType() == MVT::i1) {
12402 EVT N00VT = N0.getOperand(0).getValueType();
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();
12406 // We know that the # elements of the results is the same as the #
12407 // elements of the compare (and the # elements of the compare result for
12408 // that matter). Check to see that they are the same size. If so, we know
12409 // that the element size of the sext'd result matches the element size of
      // the compare operands.
      SDLoc DL(N);
      if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
12413 // zext(setcc) -> zext_in_reg(vsetcc) for vectors.
12414 SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
12415 N0.getOperand(1), N0.getOperand(2));
        return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
      }
12419 // If the desired elements are smaller or larger than the source
12420 // elements we can use a matching integer vector type and then
12421 // truncate/any extend followed by zext_in_reg.
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
                      N0.getOperand(1), N0.getOperand(2));
      return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
                                    N0.getValueType());
    }
    // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc)
    SDLoc DL(N);
    EVT N0VT = N0.getValueType();
    EVT N00VT = N0.getOperand(0).getValueType();
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1),
            DAG.getBoolConstant(true, DL, N0VT, N00VT),
            DAG.getBoolConstant(false, DL, N0VT, N00VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SCC);
  }
12442 // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
      isa<ConstantSDNode>(N0.getOperand(1)) &&
      N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
      N0.hasOneUse()) {
    SDValue ShAmt = N0.getOperand(1);
    if (N0.getOpcode() == ISD::SHL) {
      SDValue InnerZExt = N0.getOperand(0);
      // If the original shl may be shifting out bits, do not perform this
      // transformation.
      unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
                               InnerZExt.getOperand(0).getValueSizeInBits();
      if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
        return SDValue();
    }

    SDLoc DL(N);

    // Ensure that the shift amount is wide enough for the shifted value.
    if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
      ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);

    return DAG.getNode(N0.getOpcode(), DL, VT,
                       DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
                       ShAmt);
  }
  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
    return Res;

  return SDValue();
}
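// Worked example of the (zext (truncate x)) -> (and x, mask) path in
// visitZERO_EXTEND above (hand-computed, i32 -> i16 -> i32): for
// x = 0x12345678,
//   zext i32 (trunc i16 x) = 0x00005678 = and x, 0x0000FFFF
// so when the truncate cannot be eliminated outright, the combine emits the
// AND form and lets later folds simplify it.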
12481 SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
12482 SDValue N0 = N->getOperand(0);
12483 EVT VT = N->getValueType(0);
  // aext(undef) = undef
  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
    return Res;
12492 // fold (aext (aext x)) -> (aext x)
12493 // fold (aext (zext x)) -> (zext x)
12494 // fold (aext (sext x)) -> (sext x)
12495 if (N0.getOpcode() == ISD::ANY_EXTEND ||
12496 N0.getOpcode() == ISD::ZERO_EXTEND ||
12497 N0.getOpcode() == ISD::SIGN_EXTEND)
12498 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
12500 // fold (aext (truncate (load x))) -> (aext (smaller load x))
12501 // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
12502 if (N0.getOpcode() == ISD::TRUNCATE) {
12503 if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
12504 SDNode *oye = N0.getOperand(0).getNode();
      if (NarrowLoad.getNode() != N0.getNode()) {
        CombineTo(N0.getNode(), NarrowLoad);
        // CombineTo deleted the truncate, if needed, but not what's under it.
        AddToWorklist(oye);
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }
12514 // fold (aext (truncate x))
12515 if (N0.getOpcode() == ISD::TRUNCATE)
12516 return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
12518 // Fold (aext (and (trunc x), cst)) -> (and x, cst)
12519 // if the trunc is not free.
12520 if (N0.getOpcode() == ISD::AND &&
12521 N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
12522 N0.getOperand(1).getOpcode() == ISD::Constant &&
12523 !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
12524 N0.getValueType())) {
    SDLoc DL(N);
    SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
12527 SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1));
12528 assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!");
    return DAG.getNode(ISD::AND, DL, VT, X, Y);
  }
12532 // fold (aext (load x)) -> (aext (truncate (extload x)))
12533 // None of the supported targets knows how to perform load and any_ext
12534 // on vectors in one instruction, so attempt to fold to zext instead.
12535 if (VT.isVector()) {
12536 // Try to simplify (zext (load x)).
    if (SDValue foldedExt =
            tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
                               ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
      return foldedExt;
  } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
12542 ISD::isUNINDEXEDLoad(N0.getNode()) &&
12543 TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
12544 bool DoXform = true;
12545 SmallVector<SDNode *, 4> SetCCs;
    if (!N0.hasOneUse())
      DoXform =
          ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
    if (DoXform) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12551 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
12552 LN0->getChain(), LN0->getBasePtr(),
12553 N0.getValueType(), LN0->getMemOperand());
12554 ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
12555 // If the load value is used only by N, replace it via CombineTo N.
12556 bool NoReplaceTrunc = N0.hasOneUse();
12557 CombineTo(N, ExtLoad);
      if (NoReplaceTrunc) {
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
        recursivelyDeleteUnusedNodes(LN0);
      } else {
        SDValue Trunc =
            DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
        CombineTo(LN0, Trunc, ExtLoad.getValue(1));
      }
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }
12570 // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
12571 // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
12572 // fold (aext ( extload x)) -> (aext (truncate (extload x)))
12573 if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
12574 ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
12575 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12576 ISD::LoadExtType ExtType = LN0->getExtensionType();
12577 EVT MemVT = LN0->getMemoryVT();
12578 if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
12579 SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
12580 VT, LN0->getChain(), LN0->getBasePtr(),
12581 MemVT, LN0->getMemOperand());
12582 CombineTo(N, ExtLoad);
12583 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
12584 recursivelyDeleteUnusedNodes(LN0);
      return SDValue(N, 0); // Return N so it doesn't get rechecked!
    }
  }
12589 if (N0.getOpcode() == ISD::SETCC) {
12590 // Propagate fast-math-flags.
12591 SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
12594 // aext(setcc) -> vsetcc
12595 // aext(setcc) -> truncate(vsetcc)
12596 // aext(setcc) -> aext(vsetcc)
12597 // Only do this before legalize for now.
12598 if (VT.isVector() && !LegalOperations) {
12599 EVT N00VT = N0.getOperand(0).getValueType();
      if (getSetCCResultType(N00VT) == N0.getValueType())
        return SDValue();
12603 // We know that the # elements of the results is the same as the
12604 // # elements of the compare (and the # elements of the compare result
12605 // for that matter). Check to see that they are the same size. If so,
12606 // we know that the element size of the sext'd result matches the
12607 // element size of the compare operands.
      if (VT.getSizeInBits() == N00VT.getSizeInBits())
        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1),
                            cast<CondCodeSDNode>(N0.getOperand(2))->get());
12613 // If the desired elements are smaller or larger than the source
12614 // elements we can use a matching integer vector type and then
12615 // truncate/any extend
      EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
      SDValue VsetCC =
          DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
                       N0.getOperand(1),
                       cast<CondCodeSDNode>(N0.getOperand(2))->get());
      return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
    }

    // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
    SDLoc DL(N);
    if (SDValue SCC = SimplifySelectCC(
            DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
            DAG.getConstant(0, DL, VT),
            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
      return SCC;
  }

  if (SDValue NewCtPop = widenCtPop(N, DAG))
    return NewCtPop;

  if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
    return Res;

  return SDValue();
}
12642 SDValue DAGCombiner::visitAssertExt(SDNode *N) {
12643 unsigned Opcode = N->getOpcode();
12644 SDValue N0 = N->getOperand(0);
12645 SDValue N1 = N->getOperand(1);
12646 EVT AssertVT = cast<VTSDNode>(N1)->getVT();
12648 // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
  if (N0.getOpcode() == Opcode &&
      AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
    return N0;
12653 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
12654 N0.getOperand(0).getOpcode() == Opcode) {
12655 // We have an assert, truncate, assert sandwich. Make one stronger assert
12656 // by asserting on the smallest asserted type to the larger source type.
12657 // This eliminates the later assert:
12658 // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
    // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
    SDLoc DL(N);
    SDValue BigA = N0.getOperand(0);
12662 EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
12663 EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
12664 SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
12665 SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
12666 BigA.getOperand(0), MinAssertVTVal);
12667 return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
  // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
  // than X, just move the AssertZext in front of the truncate and drop the
  // AssertSext.
12673 if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
12674 N0.getOperand(0).getOpcode() == ISD::AssertSext &&
12675 Opcode == ISD::AssertZext) {
12676 SDValue BigA = N0.getOperand(0);
12677 EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
    if (AssertVT.bitsLT(BigA_AssertVT)) {
      SDLoc DL(N);
      SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
                                      BigA.getOperand(0), N1);
      return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
    }
  }

  return SDValue();
}
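// Illustrative sketch of the assert sandwich fold above (assumed widths,
// hand-written):
//   (AssertZext (trunc (AssertZext X:i64, i16) to i32), i8)
// becomes
//   (trunc (AssertZext X:i64, i8) to i32)
// keeping only the strongest (smallest) assertion and dropping the later one.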
SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
  SDLoc DL(N);

  Align AL = cast<AssertAlignSDNode>(N)->getAlign();
12693 SDValue N0 = N->getOperand(0);
12695 // Fold (assertalign (assertalign x, AL0), AL1) ->
12696 // (assertalign x, max(AL0, AL1))
12697 if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
12698 return DAG.getAssertAlign(DL, N0.getOperand(0),
12699 std::max(AL, AAN->getAlign()));
12701 // In rare cases, there are trivial arithmetic ops in source operands. Sink
12702 // this assert down to source operands so that those arithmetic ops could be
12703 // exposed to the DAG combining.
  switch (N0.getOpcode()) {
  default:
    break;
  case ISD::ADD:
  case ISD::SUB: {
    unsigned AlignShift = Log2(AL);
12710 SDValue LHS = N0.getOperand(0);
12711 SDValue RHS = N0.getOperand(1);
12712 unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
12713 unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
12714 if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
12715 if (LHSAlignShift < AlignShift)
12716 LHS = DAG.getAssertAlign(DL, LHS, AL);
12717 if (RHSAlignShift < AlignShift)
12718 RHS = DAG.getAssertAlign(DL, RHS, AL);
      return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
    }
    break;
  }
  }

  return SDValue();
}
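// Illustrative sketch (assumed values): with AL = 8 and
//   N0 = (add P, Constant:i64<16>),
// the constant contributes 4 known trailing zero bits (16 = 0b10000), which
// meets the required Log2(8) = 3, so only P still needs the assert and the
// combine emits
//   (add (AssertAlign P, 8), Constant:i64<16>)
// exposing P's alignment to later address folds.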
12728 /// If the result of a load is shifted/masked/truncated to an effectively
12729 /// narrower type, try to transform the load to a narrower type and/or
12730 /// use an extending load.
12731 SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
12732 unsigned Opc = N->getOpcode();
12734 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
12735 SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT ExtVT = VT;

  // This transformation isn't valid for vector loads.
  if (VT.isVector())
    return SDValue();
12743 // The ShAmt variable is used to indicate that we've consumed a right
12744 // shift. I.e. we want to narrow the width of the load by skipping to load the
12745 // ShAmt least significant bits.
12746 unsigned ShAmt = 0;
12747 // A special case is when the least significant bits from the load are masked
12748 // away, but using an AND rather than a right shift. HasShiftedOffset is used
  // to indicate that the narrowed load should be left-shifted ShAmt bits to
  // get the result.
  bool HasShiftedOffset = false;
  // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
  // extended to VT.
  if (Opc == ISD::SIGN_EXTEND_INREG) {
12755 ExtType = ISD::SEXTLOAD;
12756 ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
12757 } else if (Opc == ISD::SRL || Opc == ISD::SRA) {
    // Another special-case: SRL/SRA is basically zero/sign-extending a
    // narrower value, or it may be shifting a higher subword, half or byte
    // into the lowest bits.

    // Only handle shift with constant shift amount, and the shiftee must be a
    // load.
    auto *LN = dyn_cast<LoadSDNode>(N0);
    auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!N1C || !LN)
      return SDValue();
12768 // If the shift amount is larger than the memory type then we're not
12769 // accessing any of the loaded bytes.
12770 ShAmt = N1C->getZExtValue();
12771 uint64_t MemoryWidth = LN->getMemoryVT().getScalarSizeInBits();
    if (MemoryWidth <= ShAmt)
      return SDValue();
12774 // Attempt to fold away the SRL by using ZEXTLOAD and SRA by using SEXTLOAD.
12775 ExtType = Opc == ISD::SRL ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
12776 ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
12777 // If original load is a SEXTLOAD then we can't simply replace it by a
12778 // ZEXTLOAD (we could potentially replace it by a more narrow SEXTLOAD
12779 // followed by a ZEXT, but that is not handled at the moment). Similarly if
12780 // the original load is a ZEXTLOAD and we want to use a SEXTLOAD.
    if ((LN->getExtensionType() == ISD::SEXTLOAD ||
         LN->getExtensionType() == ISD::ZEXTLOAD) &&
        LN->getExtensionType() != ExtType)
      return SDValue();
  } else if (Opc == ISD::AND) {
12786 // An AND with a constant mask is the same as a truncate + zero-extend.
    auto *AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
    if (!AndC)
      return SDValue();
12791 const APInt &Mask = AndC->getAPIntValue();
12792 unsigned ActiveBits = 0;
12793 if (Mask.isMask()) {
12794 ActiveBits = Mask.countTrailingOnes();
    } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
      HasShiftedOffset = true;
    } else {
      return SDValue();
    }

    ExtType = ISD::ZEXTLOAD;
    ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
  }
12805 // In case Opc==SRL we've already prepared ExtVT/ExtType/ShAmt based on doing
12806 // a right shift. Here we redo some of those checks, to possibly adjust the
12807 // ExtVT even further based on "a masking AND". We could also end up here for
12808 // other reasons (e.g. based on Opc==TRUNCATE) and that is why some checks
12809 // need to be done here as well.
12810 if (Opc == ISD::SRL || N0.getOpcode() == ISD::SRL) {
12811 SDValue SRL = Opc == ISD::SRL ? SDValue(N, 0) : N0;
    // Bail out when the SRL has more than one use. This is done for historical
    // (undocumented) reasons; maybe the intent was to guard the AND-masking
    // check below, and maybe the transform is non-profitable when the SRL has
    // multiple uses and we get here with Opc != ISD::SRL.
    // FIXME: Can't we just skip this check for the Opc == ISD::SRL case?
    if (!SRL.hasOneUse())
      return SDValue();
    // Only handle shift with constant shift amount, and the shiftee must be a
    // load.
    auto *LN = dyn_cast<LoadSDNode>(SRL.getOperand(0));
    auto *SRL1C = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
    if (!SRL1C || !LN)
      return SDValue();
12827 // If the shift amount is larger than the input type then we're not
12828 // accessing any of the loaded bytes. If the load was a zextload/extload
12829 // then the result of the shift+trunc is zero/undef (handled elsewhere).
12830 ShAmt = SRL1C->getZExtValue();
12831 uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
    if (ShAmt >= MemoryWidth)
      return SDValue();
12835 // Because a SRL must be assumed to *need* to zero-extend the high bits
12836 // (as opposed to anyext the high bits), we can't combine the zextload
12837 // lowering of SRL and an sextload.
    if (LN->getExtensionType() == ISD::SEXTLOAD)
      return SDValue();

    // Avoid reading outside the memory accessed by the original load (this
    // could happen if we only adjusted the load base pointer by ShAmt).
    // Instead we try to narrow the load even further. The typical scenario
    // here is:
    //   (i64 (truncate (i96 (srl (load x), 64)))) ->
    //     (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
    if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
      // Don't replace sextload by zextload.
      if (ExtType == ISD::SEXTLOAD)
        return SDValue();
      // Narrow the load.
      ExtType = ISD::ZEXTLOAD;
      ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
    }
12855 // If the SRL is only used by a masking AND, we may be able to adjust
12856 // the ExtVT to make the AND redundant.
12857 SDNode *Mask = *(SRL->use_begin());
12858 if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
12859 isa<ConstantSDNode>(Mask->getOperand(1))) {
12860 const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
12861 if (ShiftMask.isMask()) {
12862 EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
12863 ShiftMask.countTrailingOnes());
12864 // If the mask is smaller, recompute the type.
        if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
            TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
          ExtVT = MaskedVT;
      }
    }

    N0 = SRL.getOperand(0);
  }
12874 // If the load is shifted left (and the result isn't shifted back right), we
12875 // can fold a truncate through the shift. The typical scenario is that N
12876 // points at a TRUNCATE here so the attempted fold is:
12877 // (truncate (shl (load x), c))) -> (shl (narrow load x), c)
12878 // ShLeftAmt will indicate how much a narrowed load should be shifted left.
12879 unsigned ShLeftAmt = 0;
12880 if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
12881 ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
    if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
      ShLeftAmt = N01->getZExtValue();
      N0 = N0.getOperand(0);
    }
  }
  // If we haven't found a load, we can't narrow it.
  if (!isa<LoadSDNode>(N0))
    return SDValue();
12892 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12893 // Reducing the width of a volatile load is illegal. For atomics, we may be
12894 // able to reduce the width provided we never widen again. (see D66309)
  if (!LN0->isSimple() || !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
    return SDValue();
12899 auto AdjustBigEndianShift = [&](unsigned ShAmt) {
12900 unsigned LVTStoreBits =
12901 LN0->getMemoryVT().getStoreSizeInBits().getFixedSize();
12902 unsigned EVTStoreBits = ExtVT.getStoreSizeInBits().getFixedSize();
    return LVTStoreBits - EVTStoreBits - ShAmt;
  };
12906 // We need to adjust the pointer to the load by ShAmt bits in order to load
12907 // the correct bytes.
12908 unsigned PtrAdjustmentInBits =
12909 DAG.getDataLayout().isBigEndian() ? AdjustBigEndianShift(ShAmt) : ShAmt;
12911 uint64_t PtrOff = PtrAdjustmentInBits / 8;
  Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff);
  SDLoc DL(LN0);
  // The original load itself didn't wrap, so an offset within it doesn't.
  SDNodeFlags Flags;
  Flags.setNoUnsignedWrap(true);
12917 SDValue NewPtr = DAG.getMemBasePlusOffset(LN0->getBasePtr(),
12918 TypeSize::Fixed(PtrOff), DL, Flags);
  AddToWorklist(NewPtr.getNode());

  SDValue Load;
  if (ExtType == ISD::NON_EXTLOAD)
    Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
                       LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                       LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
  else
    Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
                          LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
                          NewAlign, LN0->getMemOperand()->getFlags(),
                          LN0->getAAInfo());
12932 // Replace the old load's chain with the new load's chain.
12933 WorklistRemover DeadNodes(*this);
12934 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
12936 // Shift the result left, if we've swallowed a left shift.
12937 SDValue Result = Load;
12938 if (ShLeftAmt != 0) {
12939 EVT ShImmTy = getShiftAmountTy(Result.getValueType());
    if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt))
      ShImmTy = VT;
12942 // If the shift amount is as large as the result size (but, presumably,
12943 // no larger than the source) then the useful bits of the result are
12944 // zero; we can't simply return the shortened shift, because the result
12945 // of that operation is undefined.
    if (ShLeftAmt >= VT.getScalarSizeInBits())
      Result = DAG.getConstant(0, DL, VT);
    else
      Result = DAG.getNode(ISD::SHL, DL, VT,
                           Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
  }

  if (HasShiftedOffset) {
    // We're using a shifted mask, so the load now has an offset. This means
    // the data has been loaded at lower bytes than it would have been before,
    // so we need to shl the loaded data into the correct position in the
    // register.
    SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
    Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
    DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
  }

  // Return the new loaded value.
  return Result;
}
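// Illustrative sketch of the narrowing above (little-endian, hand-checked):
//   (i32 (trunc (srl (i64 (load p)), 32)))
// loads 8 bytes and discards the low 4, so it can instead become
//   (i32 (load p + 4))
// i.e. ShAmt = 32 selects a 4-byte pointer adjustment (PtrOff = 32 / 8) into
// the originally accessed memory.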
12967 SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
12968 SDValue N0 = N->getOperand(0);
12969 SDValue N1 = N->getOperand(1);
12970 EVT VT = N->getValueType(0);
12971 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
12972 unsigned VTBits = VT.getScalarSizeInBits();
12973 unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
  // sext_in_reg(undef) = 0 because the top bits will all be the same.
  if (N0.isUndef())
    return DAG.getConstant(0, SDLoc(N), VT);
12979 // fold (sext_in_reg c1) -> c1
12980 if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
12981 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
  // If the input is already sign extended, just drop the extension.
  if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0))
    return N0;
12987 // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
12988 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
12989 ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
                       N0.getOperand(1));
12993 // fold (sext_in_reg (sext x)) -> (sext x)
12994 // fold (sext_in_reg (aext x)) -> (sext x)
12995 // if x is small enough or if we know that x has more than 1 sign bit and the
12996 // sign_extend_inreg is extending from one of them.
12997 if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
12998 SDValue N00 = N0.getOperand(0);
12999 unsigned N00Bits = N00.getScalarValueSizeInBits();
13000 if ((N00Bits <= ExtVTBits ||
13001 DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) &&
13002 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
  }
13006 // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
13007 // if x is small enough or if we know that x has more than 1 sign bit and the
13008 // sign_extend_inreg is extending from one of them.
13009 if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
13010 N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
13011 N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
13012 SDValue N00 = N0.getOperand(0);
13013 unsigned N00Bits = N00.getScalarValueSizeInBits();
13014 unsigned DstElts = N0.getValueType().getVectorMinNumElements();
13015 unsigned SrcElts = N00.getValueType().getVectorMinNumElements();
13016 bool IsZext = N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
13017 APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
13018 if ((N00Bits == ExtVTBits ||
13019 (!IsZext && (N00Bits < ExtVTBits ||
13020 DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) &&
13021 (!LegalOperations ||
13022 TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
13023 return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);
13024 }
13026 // fold (sext_in_reg (zext x)) -> (sext x)
13027 // iff we are extending the source sign bit.
13028 if (N0.getOpcode() == ISD::ZERO_EXTEND) {
13029 SDValue N00 = N0.getOperand(0);
13030 if (N00.getScalarValueSizeInBits() == ExtVTBits &&
13031 (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
13032 return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
13033 }
13035 // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
13036 if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
13037 return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);
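// Example of the fold above: with ExtVT = i8 on an i32 value X whose bit 7
// is known zero, (sext_in_reg X, i8) == (zext_in_reg X, i8), i.e. a plain
// (and X, 0xff), which is typically cheaper than a shift pair.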
13039 // fold operands of sext_in_reg based on knowledge that the top bits are not
13040 // demanded.
13041 if (SimplifyDemandedBits(SDValue(N, 0)))
13042 return SDValue(N, 0);
13044 // fold (sext_in_reg (load x)) -> (smaller sextload x)
13045 // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
13046 if (SDValue NarrowLoad = reduceLoadWidth(N))
13047 return NarrowLoad;
13049 // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
13050 // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
13051 // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
13052 if (N0.getOpcode() == ISD::SRL) {
13053 if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
13054 if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
13055 // We can turn this into an SRA iff the input to the SRL is already sign
13056 // extended enough.
13057 unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
13058 if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
13059 return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
13060 N0.getOperand(1));
13061 }
13062 }
13064 // fold (sext_inreg (extload x)) -> (sextload x)
13065 // If sextload is not supported by target, we can only do the combine when
13066 // load has one use. Doing otherwise can block folding the extload with other
13067 // extends that the target does support.
13068 if (ISD::isEXTLoad(N0.getNode()) &&
13069 ISD::isUNINDEXEDLoad(N0.getNode()) &&
13070 ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
13071 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
13072 N0.hasOneUse()) ||
13073 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
13074 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13075 SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
13076 LN0->getChain(),
13077 LN0->getBasePtr(), ExtVT,
13078 LN0->getMemOperand());
13079 CombineTo(N, ExtLoad);
13080 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
13081 AddToWorklist(ExtLoad.getNode());
13082 return SDValue(N, 0); // Return N so it doesn't get rechecked!
13083 }
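// A concrete instance (assuming the target supports the extending load):
//   (sext_in_reg (i32 (extload i8 [p])), i8) -> (i32 (sextload i8 [p]))
// The anyext load's undefined high bits are produced by a sign-extending
// load instead, making the in-register extension redundant.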
13085 // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
13086 if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
13087 N0.hasOneUse() &&
13088 ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
13089 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
13090 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
13091 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13092 SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
13093 LN0->getChain(),
13094 LN0->getBasePtr(), ExtVT,
13095 LN0->getMemOperand());
13096 CombineTo(N, ExtLoad);
13097 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
13098 return SDValue(N, 0); // Return N so it doesn't get rechecked!
13099 }
13101 // fold (sext_inreg (masked_load x)) -> (sext_masked_load x)
13102 // ignore it if the masked load is already sign extended
13103 if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) {
13104 if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() &&
13105 Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD &&
13106 TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) {
13107 SDValue ExtMaskedLoad = DAG.getMaskedLoad(
13108 VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(),
13109 Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(),
13110 Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad());
13111 CombineTo(N, ExtMaskedLoad);
13112 CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1));
13113 return SDValue(N, 0); // Return N so it doesn't get rechecked!
13114 }
13115 }
13117 // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
13118 if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
13119 if (SDValue(GN0, 0).hasOneUse() &&
13120 ExtVT == GN0->getMemoryVT() &&
13121 TLI.isVectorLoadExtDesirable(SDValue(GN0, 0))) {
13122 SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
13123 GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
13125 SDValue ExtLoad = DAG.getMaskedGather(
13126 DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
13127 GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
13129 CombineTo(N, ExtLoad);
13130 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
13131 AddToWorklist(ExtLoad.getNode());
13132 return SDValue(N, 0); // Return N so it doesn't get rechecked!
13133 }
13134 }
13136 // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
13137 if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
13138 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
13139 N0.getOperand(1), false))
13140 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
13141 }
13143 return SDValue();
13144 }
13146 SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) {
13147 SDValue N0 = N->getOperand(0);
13148 EVT VT = N->getValueType(0);
13150 // {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same.
13151 if (N0.isUndef())
13152 return DAG.getConstant(0, SDLoc(N), VT);
13154 if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
13155 return Res;
13157 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
13158 return SDValue(N, 0);
13160 return SDValue();
13161 }
13163 SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
13164 SDValue N0 = N->getOperand(0);
13165 EVT VT = N->getValueType(0);
13166 EVT SrcVT = N0.getValueType();
13167 bool isLE = DAG.getDataLayout().isLittleEndian();
13169 // trunc(undef) = undef
13170 if (N0.isUndef())
13171 return DAG.getUNDEF(VT);
13173 // fold (truncate (truncate x)) -> (truncate x)
13174 if (N0.getOpcode() == ISD::TRUNCATE)
13175 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
13177 // fold (truncate c1) -> c1
13178 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
13179 SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
13180 if (C.getNode() != N)
13181 return C;
13182 }
13184 // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
13185 if (N0.getOpcode() == ISD::ZERO_EXTEND ||
13186 N0.getOpcode() == ISD::SIGN_EXTEND ||
13187 N0.getOpcode() == ISD::ANY_EXTEND) {
13188 // if the source is smaller than the dest, we still need an extend.
13189 if (N0.getOperand(0).getValueType().bitsLT(VT))
13190 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
13191 // if the source is larger than the dest, then we just need the truncate.
13192 if (N0.getOperand(0).getValueType().bitsGT(VT))
13193 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
13194 // if the source and dest are the same type, we can drop both the extend
13195 // and the truncate.
13196 return N0.getOperand(0);
13197 }
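// Concrete instances of the three cases above (illustrative types):
//   (i32 trunc (i64 zext (i16 x))) -> (i32 zext (i16 x))  // source smaller
//   (i16 trunc (i64 sext (i32 x))) -> (i16 trunc (i32 x)) // source larger
//   (i32 trunc (i64 aext (i32 x))) -> x                   // same type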
13199 // Try to narrow a truncate-of-sext_in_reg to the destination type:
13200 // trunc (sign_ext_inreg X, iM) to iN --> sign_ext_inreg (trunc X to iN), iM
13201 if (!LegalTypes && N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
13202 N0.hasOneUse()) {
13203 SDValue X = N0.getOperand(0);
13204 SDValue ExtVal = N0.getOperand(1);
13205 EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
13206 if (ExtVT.bitsLT(VT)) {
13207 SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
13208 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);
13209 }
13210 }
13212 // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
13213 if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
13214 return SDValue();
13216 // Fold extract-and-trunc into a narrow extract. For example:
13217 // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
13218 // i32 y = TRUNCATE(i64 x)
13219 //        -- becomes --
13220 // v16i8 b = BITCAST (v2i64 val)
13221 // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
13223 // Note: We only run this optimization after type legalization (which often
13224 // creates this pattern) and before operation legalization after which
13225 // we need to be more careful about the vector instructions that we generate.
13226 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
13227 LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
13228 EVT VecTy = N0.getOperand(0).getValueType();
13229 EVT ExTy = N0.getValueType();
13230 EVT TrTy = N->getValueType(0);
13232 auto EltCnt = VecTy.getVectorElementCount();
13233 unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
13234 auto NewEltCnt = EltCnt * SizeRatio;
13236 EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt);
13237 assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
13239 SDValue EltNo = N0->getOperand(1);
13240 if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
13241 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
13242 int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
13244 SDLoc DL(N);
13245 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
13246 DAG.getBitcast(NVT, N0.getOperand(0)),
13247 DAG.getVectorIdxConstant(Index, DL));
13248 }
13249 }
13251 // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
13252 if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
13253 if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
13254 TLI.isTruncateFree(SrcVT, VT)) {
13255 SDLoc SL(N0);
13256 SDValue Cond = N0.getOperand(0);
13257 SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
13258 SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
13259 return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
13260 }
13261 }
13263 // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
13264 if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
13265 (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
13266 TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
13267 SDValue Amt = N0.getOperand(1);
13268 KnownBits Known = DAG.computeKnownBits(Amt);
13269 unsigned Size = VT.getScalarSizeInBits();
13270 if (Known.countMaxActiveBits() <= Log2_32(Size)) {
13271 SDLoc SL(N);
13272 EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
13274 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
13275 if (AmtVT != Amt.getValueType()) {
13276 Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
13277 AddToWorklist(Amt.getNode());
13279 return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);
13280 }
13281 }
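// E.g. (i32 trunc (i64 shl X, K)) -> (i32 shl (i32 trunc X), K) when K is
// provably < 32; the known-bits check above guarantees the narrow shift
// amount stays in range, since an out-of-range SHL would be undefined.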
13283 if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))
13284 return V;
13286 // Attempt to pre-truncate BUILD_VECTOR sources.
13287 if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
13288 TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
13289 // Avoid creating illegal types if running after type legalizer.
13290 (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
13291 SDLoc DL(N);
13292 EVT SVT = VT.getScalarType();
13293 SmallVector<SDValue, 8> TruncOps;
13294 for (const SDValue &Op : N0->op_values()) {
13295 SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
13296 TruncOps.push_back(TruncOp);
13297 }
13298 return DAG.getBuildVector(VT, DL, TruncOps);
13299 }
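// Shape of the fold (illustrative): (v4i16 trunc (v4i32 build_vector a, b,
// c, d)) -> (v4i16 build_vector (trunc a), (trunc b), (trunc c), (trunc d)),
// profitable exactly when the scalar truncates are free as checked above.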
13301 // Fold a series of buildvector, bitcast, and truncate if possible.
13302 // For example fold
13303 // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
13304 // (2xi32 (buildvector x, y)).
13305 if (Level == AfterLegalizeVectorOps && VT.isVector() &&
13306 N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
13307 N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
13308 N0.getOperand(0).hasOneUse()) {
13309 SDValue BuildVect = N0.getOperand(0);
13310 EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
13311 EVT TruncVecEltTy = VT.getVectorElementType();
13313 // Check that the element types match.
13314 if (BuildVectEltTy == TruncVecEltTy) {
13315 // Now we only need to compute the offset of the truncated elements.
13316 unsigned BuildVecNumElts = BuildVect.getNumOperands();
13317 unsigned TruncVecNumElts = VT.getVectorNumElements();
13318 unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
13320 assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
13321 "Invalid number of elements");
13323 SmallVector<SDValue, 8> Opnds;
13324 for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
13325 Opnds.push_back(BuildVect.getOperand(i));
13327 return DAG.getBuildVector(VT, SDLoc(N), Opnds);
13328 }
13329 }
13331 // fold (truncate (load x)) -> (smaller load x)
13332 // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
13333 if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
13334 if (SDValue Reduced = reduceLoadWidth(N))
13335 return Reduced;
13337 // Handle the case where the load remains an extending load even
13338 // after truncation.
13339 if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
13340 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13341 if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) {
13342 SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
13343 VT, LN0->getChain(), LN0->getBasePtr(),
13344 LN0->getMemoryVT(),
13345 LN0->getMemOperand());
13346 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
13347 return NewLoad;
13348 }
13349 }
13350 }
13352 // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
13353 // where ... are all 'undef'.
13354 if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
13355 SmallVector<EVT, 8> VTs;
13356 SDValue V;
13357 unsigned Idx = 0;
13358 unsigned NumDefs = 0;
13360 for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
13361 SDValue X = N0.getOperand(i);
13362 if (!X.isUndef()) {
13363 V = X;
13364 Idx = i;
13365 NumDefs++;
13366 }
13367 // Stop if more than one member is non-undef.
13368 if (NumDefs > 1)
13369 break;
13371 VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
13372 VT.getVectorElementType(),
13373 X.getValueType().getVectorElementCount()));
13374 }
13376 if (NumDefs == 0)
13377 return DAG.getUNDEF(VT);
13379 if (NumDefs == 1) {
13380 assert(V.getNode() && "The single defined operand is empty!");
13381 SmallVector<SDValue, 8> Opnds;
13382 for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
13383 if (i != Idx) {
13384 Opnds.push_back(DAG.getUNDEF(VTs[i]));
13385 continue;
13386 }
13387 SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
13388 AddToWorklist(NV.getNode());
13389 Opnds.push_back(NV);
13391 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
13392 }
13393 }
13395 // Fold truncate of a bitcast of a vector to an extract of the low vector
13396 // element.
13397 //
13398 // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
13399 if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
13400 SDValue VecSrc = N0.getOperand(0);
13401 EVT VecSrcVT = VecSrc.getValueType();
13402 if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
13403 (!LegalOperations ||
13404 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
13406 SDLoc SL(N);
13407 unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
13408 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
13409 DAG.getVectorIdxConstant(Idx, SL));
13410 }
13411 }
13413 // Simplify the operands using demanded-bits information.
13414 if (SimplifyDemandedBits(SDValue(N, 0)))
13415 return SDValue(N, 0);
13417 // fold (truncate (extract_subvector(ext x))) ->
13418 // (extract_subvector x)
13419 // TODO: This can be generalized to cover cases where the truncate and extract
13420 // do not fully cancel each other out.
13421 if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
13422 SDValue N00 = N0.getOperand(0);
13423 if (N00.getOpcode() == ISD::SIGN_EXTEND ||
13424 N00.getOpcode() == ISD::ZERO_EXTEND ||
13425 N00.getOpcode() == ISD::ANY_EXTEND) {
13426 if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
13427 VT.getVectorElementType())
13428 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
13429 N00.getOperand(0), N0.getOperand(1));
13430 }
13431 }
13433 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
13434 return NewVSel;
13436 // Narrow a suitable binary operation with a non-opaque constant operand by
13437 // moving it ahead of the truncate. This is limited to pre-legalization
13438 // because targets may prefer a wider type during later combines and invert
13439 // this transform.
13440 switch (N0.getOpcode()) {
13441 case ISD::ADD:
13442 case ISD::SUB:
13443 case ISD::MUL:
13444 case ISD::AND:
13445 case ISD::OR:
13446 case ISD::XOR:
13447 if (!LegalOperations && N0.hasOneUse() &&
13448 (isConstantOrConstantVector(N0.getOperand(0), true) ||
13449 isConstantOrConstantVector(N0.getOperand(1), true))) {
13450 // TODO: We already restricted this to pre-legalization, but for vectors
13451 // we are extra cautious to not create an unsupported operation.
13452 // Target-specific changes are likely needed to avoid regressions here.
13453 if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
13454 SDLoc DL(N);
13455 SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
13456 SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
13457 return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
13458 }
13459 }
13460 break;
13461 case ISD::ADDE:
13462 case ISD::ADDCARRY:
13463 // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
13464 // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
13465 // When the adde's carry is not used.
13466 // We only do this for ADDCARRY before operation legalization.
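// E.g. (i32 trunc (i64 addcarry X, Y, C)), with the carry-out unused,
// -> (i32 addcarry (i32 trunc X), (i32 trunc Y), C): the low 32 result
// bits only depend on the low 32 bits of X and Y plus the carry-in.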
13467 if (((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
13468 TLI.isOperationLegal(N0.getOpcode(), VT)) &&
13469 N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) {
13470 SDLoc DL(N);
13471 SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
13472 SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
13473 SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1));
13474 return DAG.getNode(N0.getOpcode(), DL, VTs, X, Y, N0.getOperand(2));
13475 }
13476 break;
13477 case ISD::USUBSAT:
13478 // Truncate the USUBSAT only if LHS is a known zero-extension. It's not
13479 // enough to know that the upper bits are zero; we must ensure that we
13480 // don't introduce an extra truncate.
13481 if (!LegalOperations && N0.hasOneUse() &&
13482 N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
13483 N0.getOperand(0).getOperand(0).getScalarValueSizeInBits() <=
13484 VT.getScalarSizeInBits() &&
13485 hasOperation(N0.getOpcode(), VT)) {
13486 return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
13487 DAG, SDLoc(N));
13488 }
13489 break;
13490 }
13492 return SDValue();
13493 }
13495 static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
13496 SDValue Elt = N->getOperand(i);
13497 if (Elt.getOpcode() != ISD::MERGE_VALUES)
13498 return Elt.getNode();
13499 return Elt.getOperand(Elt.getResNo()).getNode();
13500 }
13502 /// build_pair (load, load) -> load
13503 /// if load locations are consecutive.
13504 SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
13505 assert(N->getOpcode() == ISD::BUILD_PAIR);
13507 auto *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
13508 auto *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
13510 // A BUILD_PAIR always has the least significant part in elt 0 and the
13511 // most significant part in elt 1. So when combining into one large load, we
13512 // need to consider the endianness.
13513 if (DAG.getDataLayout().isBigEndian())
13514 std::swap(LD1, LD2);
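// Sketch of the combine on a little-endian target:
//   (i64 build_pair (i32 load [p]), (i32 load [p+4])) -> (i64 load [p])
// provided the wide load is legal and fast and the two narrow loads have
// no other users; big-endian targets reach the same shape via the swap.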
13516 if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !ISD::isNON_EXTLoad(LD2) ||
13517 !LD1->hasOneUse() || !LD2->hasOneUse() ||
13518 LD1->getAddressSpace() != LD2->getAddressSpace())
13519 return SDValue();
13521 bool LD1Fast = false;
13522 EVT LD1VT = LD1->getValueType(0);
13523 unsigned LD1Bytes = LD1VT.getStoreSize();
13524 if ((!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
13525 DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1) &&
13526 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
13527 *LD1->getMemOperand(), &LD1Fast) && LD1Fast)
13528 return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
13529 LD1->getPointerInfo(), LD1->getAlign());
13531 return SDValue();
13532 }
13534 static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
13535 // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
13536 // and Lo parts; on big-endian machines it doesn't.
13537 return DAG.getDataLayout().isBigEndian() ? 1 : 0;
13538 }
13540 static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
13541 const TargetLowering &TLI) {
13542 // If this is not a bitcast to an FP type or if the target doesn't have
13543 // IEEE754-compliant FP logic, we're done.
13544 EVT VT = N->getValueType(0);
13545 if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
13546 return SDValue();
13548 // TODO: Handle cases where the integer constant is a different scalar
13549 // bitwidth to the FP.
13550 SDValue N0 = N->getOperand(0);
13551 EVT SourceVT = N0.getValueType();
13552 if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())
13553 return SDValue();
13555 unsigned FPOpcode;
13556 APInt SignMask;
13557 switch (N0.getOpcode()) {
13558 case ISD::AND:
13559 FPOpcode = ISD::FABS;
13560 SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
13561 break;
13562 case ISD::XOR:
13563 FPOpcode = ISD::FNEG;
13564 SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
13565 break;
13566 case ISD::OR:
13567 FPOpcode = ISD::FABS;
13568 SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
13569 break;
13570 default:
13571 return SDValue();
13572 }
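// Worked f32 example of the masks selected above:
//   (f32 bitcast (and (i32 bitcast X), 0x7fffffff)) -> (fabs X)
//   (f32 bitcast (xor (i32 bitcast X), 0x80000000)) -> (fneg X)
//   (f32 bitcast (or  (i32 bitcast X), 0x80000000)) -> (fneg (fabs X))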
13574 // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
13575 // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
13576 // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
13577 //   fneg (fabs X)
13578 SDValue LogicOp0 = N0.getOperand(0);
13579 ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
13580 if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
13581 LogicOp0.getOpcode() == ISD::BITCAST &&
13582 LogicOp0.getOperand(0).getValueType() == VT) {
13583 SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
13584 NumFPLogicOpsConv++;
13585 if (N0.getOpcode() == ISD::OR)
13586 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
13587 return FPOp;
13588 }
13590 return SDValue();
13591 }
13593 SDValue DAGCombiner::visitBITCAST(SDNode *N) {
13594 SDValue N0 = N->getOperand(0);
13595 EVT VT = N->getValueType(0);
13597 if (N0.isUndef())
13598 return DAG.getUNDEF(VT);
13600 // If the input is a BUILD_VECTOR with all constant elements, fold this now.
13601 // Only do this before legalize types, unless both types are integer and the
13602 // scalar type is legal. Only do this before legalize ops, since the target
13603 // may be depending on the bitcast.
13604 // First check to see if this is all constant.
13605 // TODO: Support FP bitcasts after legalize types.
13606 if (VT.isVector() &&
13607 (!LegalTypes ||
13608 (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
13609 TLI.isTypeLegal(VT.getVectorElementType()))) &&
13610 N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() &&
13611 cast<BuildVectorSDNode>(N0)->isConstant())
13612 return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
13613 VT.getVectorElementType());
13615 // If the input is a constant, let getNode fold it.
13616 if (isIntOrFPConstant(N0)) {
13617 // If we can't allow illegal operations, we need to check that this is just
13618 // an fp -> int or int -> fp conversion and that the resulting operation
13619 // will be legal.
13620 if (!LegalOperations ||
13621 (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
13622 TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
13623 (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
13624 TLI.isOperationLegal(ISD::Constant, VT))) {
13625 SDValue C = DAG.getBitcast(VT, N0);
13626 if (C.getNode() != N)
13627 return C;
13628 }
13629 }
13631 // (conv (conv x, t1), t2) -> (conv x, t2)
13632 if (N0.getOpcode() == ISD::BITCAST)
13633 return DAG.getBitcast(VT, N0.getOperand(0));
13635 // fold (conv (load x)) -> (load (conv*)x)
13636 // If the resultant load doesn't need a higher alignment than the original!
13637 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
13638 // Do not remove the cast if the types differ in endian layout.
13639 TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
13640 TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
13641 // If the load is volatile, we only want to change the load type if the
13642 // resulting load is legal. Otherwise we might increase the number of
13643 // memory accesses. We don't care if the original type was legal or not
13644 // as we assume software couldn't rely on the number of accesses of an
13645 // illegal type.
13646 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
13647 TLI.isOperationLegal(ISD::LOAD, VT))) {
13648 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13650 if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
13651 *LN0->getMemOperand())) {
13652 SDValue Load =
13653 DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
13654 LN0->getPointerInfo(), LN0->getAlign(),
13655 LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
13656 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
13657 return Load;
13658 }
13659 }
13661 if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
13662 return V;
13664 // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
13665 // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
13666 //
13667 // For ppc_fp128:
13668 // fold (bitcast (fneg x)) ->
13669 // flipbit = signbit
13670 // (xor (bitcast x) (build_pair flipbit, flipbit))
13672 // fold (bitcast (fabs x)) ->
13673 // flipbit = (and (extract_element (bitcast x), 0), signbit)
13674 // (xor (bitcast x) (build_pair flipbit, flipbit))
13675 // This often reduces constant pool loads.
13676 if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
13677 (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
13678 N0->hasOneUse() && VT.isInteger() && !VT.isVector() &&
13679 !N0.getValueType().isVector()) {
13680 SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
13681 AddToWorklist(NewConv.getNode());
13683 SDLoc DL(N);
13684 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
13685 assert(VT.getSizeInBits() == 128);
13686 SDValue SignBit = DAG.getConstant(
13687 APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
13688 SDValue FlipBit;
13689 if (N0.getOpcode() == ISD::FNEG) {
13690 FlipBit = SignBit;
13691 AddToWorklist(FlipBit.getNode());
13692 } else {
13693 assert(N0.getOpcode() == ISD::FABS);
13694 SDValue Hi =
13695 DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
13696 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
13697 SDLoc(NewConv)));
13698 AddToWorklist(Hi.getNode());
13699 FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
13700 AddToWorklist(FlipBit.getNode());
13701 }
13702 SDValue FlipBits =
13703 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
13704 AddToWorklist(FlipBits.getNode());
13705 return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
13706 }
13707 APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
13708 if (N0.getOpcode() == ISD::FNEG)
13709 return DAG.getNode(ISD::XOR, DL, VT,
13710 NewConv, DAG.getConstant(SignBit, DL, VT));
13711 assert(N0.getOpcode() == ISD::FABS);
13712 return DAG.getNode(ISD::AND, DL, VT,
13713 NewConv, DAG.getConstant(~SignBit, DL, VT));
13714 }
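// For example, in the non-ppcf128 path just above, with f64 bitcast to i64:
//   (i64 bitcast (fneg X)) -> (xor (i64 bitcast X), 0x8000000000000000)
//   (i64 bitcast (fabs X)) -> (and (i64 bitcast X), 0x7fffffffffffffff)
// trading an FP negate/abs for a single integer logic operation.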
13716 // fold (bitconvert (fcopysign cst, x)) ->
13717 // (or (and (bitconvert x), sign), (and cst, (not sign)))
13718 // Note that we don't handle (copysign x, cst) because this can always be
13719 // folded to an fneg or fabs.
13720 //
13721 // For ppc_fp128:
13722 // fold (bitcast (fcopysign cst, x)) ->
13723 // flipbit = (and (extract_element
13724 // (xor (bitcast cst), (bitcast x)), 0),
13725 // signbit)
13726 // (xor (bitcast cst) (build_pair flipbit, flipbit))
13727 if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() &&
13728 isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() &&
13729 !VT.isVector()) {
13730 unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
13731 EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
13732 if (isTypeLegal(IntXVT)) {
13733 SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
13734 AddToWorklist(X.getNode());
13736 // If X has a different width than the result/lhs, sext it or truncate it.
13737 unsigned VTWidth = VT.getSizeInBits();
13738 if (OrigXWidth < VTWidth) {
13739 X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
13740 AddToWorklist(X.getNode());
13741 } else if (OrigXWidth > VTWidth) {
13742 // To get the sign bit in the right place, we have to shift it right
13743 // before truncating.
13744 SDLoc DL(X);
13745 X = DAG.getNode(ISD::SRL, DL,
13746 X.getValueType(), X,
13747 DAG.getConstant(OrigXWidth-VTWidth, DL,
13748 X.getValueType()));
13749 AddToWorklist(X.getNode());
13750 X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
13751 AddToWorklist(X.getNode());
13754 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
13755 APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
13756 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
13757 AddToWorklist(Cst.getNode());
13758 SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
13759 AddToWorklist(X.getNode());
13760 SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
13761 AddToWorklist(XorResult.getNode());
13762 SDValue XorResult64 = DAG.getNode(
13763 ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
13764 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
13765 SDLoc(XorResult)));
13766 AddToWorklist(XorResult64.getNode());
13767 SDValue FlipBit =
13768 DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
13769 DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
13770 AddToWorklist(FlipBit.getNode());
13771 SDValue FlipBits =
13772 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
13773 AddToWorklist(FlipBits.getNode());
13774 return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
13776 APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
13777 X = DAG.getNode(ISD::AND, SDLoc(X), VT,
13778 X, DAG.getConstant(SignBit, SDLoc(X), VT));
13779 AddToWorklist(X.getNode());
13781 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
13782 Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
13783 Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
13784 AddToWorklist(Cst.getNode());
13786 return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
13787 }
13788 }
13790 // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
13791 if (N0.getOpcode() == ISD::BUILD_PAIR)
13792 if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
13793 return CombineLD;
13795 // Remove double bitcasts from shuffles - this is often a legacy of
13796 // XformToShuffleWithZero being used to combine bitmaskings (of
13797 // float vectors bitcast to integer vectors) into shuffles.
13798 // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
13799 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
13800 N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
13801 VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
13802 !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
13803 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
13805 // If operands are a bitcast, peek through if it casts the original VT.
13806 // If operands are a constant, just bitcast back to original VT.
13807 auto PeekThroughBitcast = [&](SDValue Op) {
13808 if (Op.getOpcode() == ISD::BITCAST &&
13809 Op.getOperand(0).getValueType() == VT)
13810 return SDValue(Op.getOperand(0));
13811 if (Op.isUndef() || isAnyConstantBuildVector(Op))
13812 return DAG.getBitcast(VT, Op);
13813 return SDValue();
13814 };
13816 // FIXME: If either input vector is bitcast, try to convert the shuffle to
13817 // the result type of this bitcast. This would eliminate at least one
13818 // bitcast. See the transform in InstCombine.
13819 SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
13820 SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
13821 if (!(SV0 && SV1))
13822 return SDValue();
13824 int MaskScale =
13825 VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
13826 SmallVector<int, 8> NewMask;
13827 for (int M : SVN->getMask())
13828 for (int i = 0; i != MaskScale; ++i)
13829 NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
13831 SDValue LegalShuffle =
13832 TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
13833 if (LegalShuffle)
13834 return LegalShuffle;
13835 }
13837 return SDValue();
13838 }
13840 SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
13841 EVT VT = N->getValueType(0);
13842 return CombineConsecutiveLoads(N, VT);
13843 }
13845 SDValue DAGCombiner::visitFREEZE(SDNode *N) {
13846 SDValue N0 = N->getOperand(0);
13848 if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
13849 return N0;
13851 // Fold freeze(bitcast(x)) -> bitcast(freeze(x)).
13852 // TODO: Replace with pushFreezeToPreventPoisonFromPropagating fold.
13853 if (N0.getOpcode() == ISD::BITCAST)
13854 return DAG.getBitcast(N->getValueType(0),
13855 DAG.getNode(ISD::FREEZE, SDLoc(N0),
13856 N0.getOperand(0).getValueType(),
13857 N0.getOperand(0)));
13859 return SDValue();
13860 }
13862 /// We know that BV is a build_vector node with Constant, ConstantFP or Undef
13863 /// operands. DstEltVT indicates the destination element value type.
13864 SDValue DAGCombiner::
13865 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
13866 EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
13868 // If this is already the right type, we're done.
13869 if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
13871 unsigned SrcBitSize = SrcEltVT.getSizeInBits();
13872 unsigned DstBitSize = DstEltVT.getSizeInBits();
13874 // If this is a conversion of N elements of one type to N elements of another
13875 // type, convert each element. This handles FP<->INT cases.
13876 if (SrcBitSize == DstBitSize) {
13877 SmallVector<SDValue, 8> Ops;
13878 for (SDValue Op : BV->op_values()) {
13879 // If the vector element type is not legal, the BUILD_VECTOR operands
13880 // are promoted and implicitly truncated. Make that explicit here.
13881 if (Op.getValueType() != SrcEltVT)
13882 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
13883 Ops.push_back(DAG.getBitcast(DstEltVT, Op));
13884 AddToWorklist(Ops.back().getNode());
13885 }
13886 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
13887 BV->getValueType(0).getVectorNumElements());
13888 return DAG.getBuildVector(VT, SDLoc(BV), Ops);
13889 }
13891 // Otherwise, we're growing or shrinking the elements. To avoid having to
13892 // handle annoying details of growing/shrinking FP values, we convert them to
13893 // integers first.
13894 if (SrcEltVT.isFloatingPoint()) {
13895 // Convert the input float vector to an int vector where the elements are the
13896 // same sizes.
13897 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
13898 BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
13899 SrcEltVT = IntVT;
13900 }
13902 // Now we know the input is an integer vector. If the output is an FP type,
13903 // convert to integer first, then to FP of the right size.
13904 if (DstEltVT.isFloatingPoint()) {
13905 EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
13906 SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
13908 // Next, convert to FP elements of the same size.
13909 return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
13910 }
13912 // Okay, we know the src/dst types are both integers of differing types.
13913 assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
13915 // TODO: Should ConstantFoldBITCASTofBUILD_VECTOR always take a
13916 // BuildVectorSDNode?
13917 auto *BVN = cast<BuildVectorSDNode>(BV);
13919 // Extract the constant raw bit data.
13920 BitVector UndefElements;
13921 SmallVector<APInt> RawBits;
13922 bool IsLE = DAG.getDataLayout().isLittleEndian();
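// Shape of the integer re-bucketing (little-endian example):
//   (v2i32 bitcast (v4i16 build_vector 0x1111, 0x2222, 0x3333, 0x4444))
//     -> (v2i32 build_vector 0x22221111, 0x44443333)
// getConstantRawBits performs the endian-dependent packing or splitting.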
13923 if (!BVN->getConstantRawBits(IsLE, DstBitSize, RawBits, UndefElements))
13924 return SDValue();
13926 SDLoc DL(BV);
13927 SmallVector<SDValue, 8> Ops;
13928 for (unsigned I = 0, E = RawBits.size(); I != E; ++I) {
13929 if (UndefElements[I])
13930 Ops.push_back(DAG.getUNDEF(DstEltVT));
13931 else
13932 Ops.push_back(DAG.getConstant(RawBits[I], DL, DstEltVT));
13933 }
13935 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
13936 return DAG.getBuildVector(VT, DL, Ops);
13937 }
13939 // Returns true if floating point contraction is allowed on the FMUL-SDValue
13940 // \p N.
13941 static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
13942 assert(N.getOpcode() == ISD::FMUL);
13944 return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13945 N->getFlags().hasAllowContract();
13946 }
13948 // Returns true if `N` can assume no infinities involved in its computation.
13949 static bool hasNoInfs(const TargetOptions &Options, SDValue N) {
13950 return Options.NoInfsFPMath || N->getFlags().hasNoInfs();
13951 }
13953 /// Try to perform FMA combining on a given FADD node.
13954 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
13955 SDValue N0 = N->getOperand(0);
13956 SDValue N1 = N->getOperand(1);
13957 EVT VT = N->getValueType(0);
13958 SDLoc SL(N);
13960 const TargetOptions &Options = DAG.getTarget().Options;
13962 // Floating-point multiply-add with intermediate rounding.
13963 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
13965 // Floating-point multiply-add without intermediate rounding.
13966 bool HasFMA =
13967 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
13968 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
13970 // No valid opcode, do not combine.
13971 if (!HasFMAD && !HasFMA)
13972 return SDValue();
13974 bool CanReassociate =
13975 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
13976 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
13977 Options.UnsafeFPMath || HasFMAD);
13978 // If the addition is not contractable, do not combine.
13979 if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
13980 return SDValue();
13982 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
13983 return SDValue();
13985 // Always prefer FMAD to FMA for precision.
13986 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
13987 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
13989 auto isFusedOp = [&](SDValue N) {
13990 unsigned Opcode = N.getOpcode();
13991 return Opcode == ISD::FMA || Opcode == ISD::FMAD;
13992 };
13994 // Is the node an FMUL and contractable either due to global flags or
13995 // SDNodeFlags.
13996 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
13997 if (N.getOpcode() != ISD::FMUL)
13998 return false;
13999 return AllowFusionGlobally || N->getFlags().hasAllowContract();
14000 };
14001 // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
14002 // prefer to fold the multiply with fewer uses.
14003 if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
14004 if (N0->use_size() > N1->use_size())
14005 std::swap(N0, N1);
14006 }
14008 // fold (fadd (fmul x, y), z) -> (fma x, y, z)
14009 if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
14010 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
14011 N0.getOperand(1), N1);
14012 }
14014 // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
14015 // Note: Commutes FADD operands.
14016 if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
14017 return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
14018 N1.getOperand(1), N0);
14019 }
14021 // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
14022 // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
14023 // This requires reassociation because it changes the order of operations.
14024 SDValue FMA, E;
14025 if (CanReassociate && isFusedOp(N0) &&
14026 N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
14027 N0.getOperand(2).hasOneUse()) {
14028 FMA = N0;
14029 E = N1;
14030 } else if (CanReassociate && isFusedOp(N1) &&
14031 N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
14032 N1.getOperand(2).hasOneUse()) {
14033 FMA = N1;
14034 E = N0;
14035 }
14036 if (FMA && E) {
14037 SDValue A = FMA.getOperand(0);
14038 SDValue B = FMA.getOperand(1);
14039 SDValue C = FMA.getOperand(2).getOperand(0);
14040 SDValue D = FMA.getOperand(2).getOperand(1);
14041 SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
14042 return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE);
14043 }
14045 // Look through FP_EXTEND nodes to do more combining.
14047 // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
14048 if (N0.getOpcode() == ISD::FP_EXTEND) {
14049 SDValue N00 = N0.getOperand(0);
14050 if (isContractableFMUL(N00) &&
14051 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14052 N00.getValueType())) {
14053 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14054 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14055 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14056 N1);
14057 }
14058 }
14060 // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
14061 // Note: Commutes FADD operands.
14062 if (N1.getOpcode() == ISD::FP_EXTEND) {
14063 SDValue N10 = N1.getOperand(0);
14064 if (isContractableFMUL(N10) &&
14065 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14066 N10.getValueType())) {
14067 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14068 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)),
14069 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)),
14070 N0);
14071 }
14072 }
14074 // More folding opportunities when target permits.
14075 if (Aggressive) {
14076 // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
14077 // -> (fma x, y, (fma (fpext u), (fpext v), z))
14078 auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
14079 SDValue Z) {
14080 return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
14081 DAG.getNode(PreferredFusedOpcode, SL, VT,
14082 DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
14083 DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
14084 Z));
14085 };
14086 if (isFusedOp(N0)) {
14087 SDValue N02 = N0.getOperand(2);
14088 if (N02.getOpcode() == ISD::FP_EXTEND) {
14089 SDValue N020 = N02.getOperand(0);
14090 if (isContractableFMUL(N020) &&
14091 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14092 N020.getValueType())) {
14093 return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
14094 N020.getOperand(0), N020.getOperand(1),
14100 // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
14101 // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
14102 // FIXME: This turns two single-precision and one double-precision
14103 // operation into two double-precision operations, which might not be
14104 // interesting for all targets, especially GPUs.
14105 auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
14106 SDValue Z) {
14107 return DAG.getNode(
14108 PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
14109 DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
14110 DAG.getNode(PreferredFusedOpcode, SL, VT,
14111 DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
14112 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z));
14113 };
14114 if (N0.getOpcode() == ISD::FP_EXTEND) {
14115 SDValue N00 = N0.getOperand(0);
14116 if (isFusedOp(N00)) {
14117 SDValue N002 = N00.getOperand(2);
14118 if (isContractableFMUL(N002) &&
14119 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14120 N00.getValueType())) {
14121 return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
14122 N002.getOperand(0), N002.getOperand(1),
14123 N1);
14124 }
14125 }
14126 }
14128 // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
14129 // -> (fma y, z, (fma (fpext u), (fpext v), x))
14130 if (isFusedOp(N1)) {
14131 SDValue N12 = N1.getOperand(2);
14132 if (N12.getOpcode() == ISD::FP_EXTEND) {
14133 SDValue N120 = N12.getOperand(0);
14134 if (isContractableFMUL(N120) &&
14135 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14136 N120.getValueType())) {
14137 return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
14138 N120.getOperand(0), N120.getOperand(1),
14139 N0);
14140 }
14141 }
14142 }
14144 // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
14145 // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
14146 // FIXME: This turns two single-precision and one double-precision
14147 // operation into two double-precision operations, which might not be
14148 // interesting for all targets, especially GPUs.
14149 if (N1.getOpcode() == ISD::FP_EXTEND) {
14150 SDValue N10 = N1.getOperand(0);
14151 if (isFusedOp(N10)) {
14152 SDValue N102 = N10.getOperand(2);
14153 if (isContractableFMUL(N102) &&
14154 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14155 N10.getValueType())) {
14156 return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
14157 N102.getOperand(0), N102.getOperand(1),
14158 N0);
14159 }
14160 }
14161 }
14162 }
14164 return SDValue();
14165 }
14167 /// Try to perform FMA combining on a given FSUB node.
14168 SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
14169 SDValue N0 = N->getOperand(0);
14170 SDValue N1 = N->getOperand(1);
14171 EVT VT = N->getValueType(0);
14174 const TargetOptions &Options = DAG.getTarget().Options;
14175 // Floating-point multiply-add with intermediate rounding.
14176 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
14178 // Floating-point multiply-add without intermediate rounding.
14179 bool HasFMA =
14180 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
14181 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
14183 // No valid opcode, do not combine.
14184 if (!HasFMAD && !HasFMA)
14185 return SDValue();
14187 const SDNodeFlags Flags = N->getFlags();
14188 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
14189 Options.UnsafeFPMath || HasFMAD);
14191 // If the subtraction is not contractable, do not combine.
14192 if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
14193 return SDValue();
14195 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
14196 return SDValue();
14198 // Always prefer FMAD to FMA for precision.
14199 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
14200 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
14201 bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
14203 // Is the node an FMUL and contractable either due to global flags or
14204 // SDNodeFlags.
14205 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
14206 if (N.getOpcode() != ISD::FMUL)
14207 return false;
14208 return AllowFusionGlobally || N->getFlags().hasAllowContract();
14209 };
14211 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
14212 auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
14213 if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
14214 return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
14215 XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z));
14216 }
14217 return SDValue();
14218 };
14220 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
14221 // Note: Commutes FSUB operands.
14222 auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
14223 if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
14224 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14225 DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
14226 YZ.getOperand(1), X);
14227 }
14228 return SDValue();
14229 };
14231 // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
14232 // prefer to fold the multiply with fewer uses.
14233 if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
14234 (N0->use_size() > N1->use_size())) {
14235 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
14236 if (SDValue V = tryToFoldXSubYZ(N0, N1))
14237 return V;
14238 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
14239 if (SDValue V = tryToFoldXYSubZ(N0, N1))
14240 return V;
14241 }
14242 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
14243 if (SDValue V = tryToFoldXYSubZ(N0, N1))
14244 return V;
14245 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
14246 if (SDValue V = tryToFoldXSubYZ(N0, N1))
14247 return V;
14250 // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
14251 if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
14252 (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
14253 SDValue N00 = N0.getOperand(0).getOperand(0);
14254 SDValue N01 = N0.getOperand(0).getOperand(1);
14255 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14256 DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
14257 DAG.getNode(ISD::FNEG, SL, VT, N1));
14258 }
14260 // Look through FP_EXTEND nodes to do more combining.
14262 // fold (fsub (fpext (fmul x, y)), z)
14263 // -> (fma (fpext x), (fpext y), (fneg z))
14264 if (N0.getOpcode() == ISD::FP_EXTEND) {
14265 SDValue N00 = N0.getOperand(0);
14266 if (isContractableFMUL(N00) &&
14267 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14268 N00.getValueType())) {
14269 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14270 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14271 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14272 DAG.getNode(ISD::FNEG, SL, VT, N1));
14273 }
14274 }
14276 // fold (fsub x, (fpext (fmul y, z)))
14277 // -> (fma (fneg (fpext y)), (fpext z), x)
14278 // Note: Commutes FSUB operands.
14279 if (N1.getOpcode() == ISD::FP_EXTEND) {
14280 SDValue N10 = N1.getOperand(0);
14281 if (isContractableFMUL(N10) &&
14282 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14283 N10.getValueType())) {
14284 return DAG.getNode(
14285 PreferredFusedOpcode, SL, VT,
14286 DAG.getNode(ISD::FNEG, SL, VT,
14287 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))),
14288 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0);
14289 }
14290 }
14292 // fold (fsub (fpext (fneg (fmul x, y))), z)
14293 // -> (fneg (fma (fpext x), (fpext y), z))
14294 // Note: This could be removed with appropriate canonicalization of the
14295 // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
14296 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
14297 // from implementing the canonicalization in visitFSUB.
14298 if (N0.getOpcode() == ISD::FP_EXTEND) {
14299 SDValue N00 = N0.getOperand(0);
14300 if (N00.getOpcode() == ISD::FNEG) {
14301 SDValue N000 = N00.getOperand(0);
14302 if (isContractableFMUL(N000) &&
14303 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14304 N00.getValueType())) {
14305 return DAG.getNode(
14306 ISD::FNEG, SL, VT,
14307 DAG.getNode(PreferredFusedOpcode, SL, VT,
14308 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
14309 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
14310 N1));
14311 }
14312 }
14313 }
14315 // fold (fsub (fneg (fpext (fmul x, y))), z)
14316 // -> (fneg (fma (fpext x), (fpext y), z))
14317 // Note: This could be removed with appropriate canonicalization of the
14318 // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
14319 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent us
14320 // from implementing the canonicalization in visitFSUB.
14321 if (N0.getOpcode() == ISD::FNEG) {
14322 SDValue N00 = N0.getOperand(0);
14323 if (N00.getOpcode() == ISD::FP_EXTEND) {
14324 SDValue N000 = N00.getOperand(0);
14325 if (isContractableFMUL(N000) &&
14326 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14327 N000.getValueType())) {
14328 return DAG.getNode(
14329 ISD::FNEG, SL, VT,
14330 DAG.getNode(PreferredFusedOpcode, SL, VT,
14331 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
14332 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
14333 N1));
14334 }
14335 }
14336 }
14338 auto isReassociable = [Options](SDNode *N) {
14339 return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
14340 };
14342 auto isContractableAndReassociableFMUL = [isContractableFMUL,
14343 isReassociable](SDValue N) {
14344 return isContractableFMUL(N) && isReassociable(N.getNode());
14345 };
14347 auto isFusedOp = [&](SDValue N) {
14348 unsigned Opcode = N.getOpcode();
14349 return Opcode == ISD::FMA || Opcode == ISD::FMAD;
14350 };
14352 // More folding opportunities when target permits.
14353 if (Aggressive && isReassociable(N)) {
14354 bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
14355 // fold (fsub (fma x, y, (fmul u, v)), z)
14356 // -> (fma x, y, (fma u, v, (fneg z)))
14357 if (CanFuse && isFusedOp(N0) &&
14358 isContractableAndReassociableFMUL(N0.getOperand(2)) &&
14359 N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
14360 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
14361 N0.getOperand(1),
14362 DAG.getNode(PreferredFusedOpcode, SL, VT,
14363 N0.getOperand(2).getOperand(0),
14364 N0.getOperand(2).getOperand(1),
14365 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14366 }
14368 // fold (fsub x, (fma y, z, (fmul u, v)))
14369 // -> (fma (fneg y), z, (fma (fneg u), v, x))
14370 if (CanFuse && isFusedOp(N1) &&
14371 isContractableAndReassociableFMUL(N1.getOperand(2)) &&
14372 N1->hasOneUse() && NoSignedZero) {
14373 SDValue N20 = N1.getOperand(2).getOperand(0);
14374 SDValue N21 = N1.getOperand(2).getOperand(1);
14375 return DAG.getNode(
14376 PreferredFusedOpcode, SL, VT,
14377 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1),
14378 DAG.getNode(PreferredFusedOpcode, SL, VT,
14379 DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0));
14380 }
14382 // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
14383 // -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
14384 if (isFusedOp(N0) && N0->hasOneUse()) {
14385 SDValue N02 = N0.getOperand(2);
14386 if (N02.getOpcode() == ISD::FP_EXTEND) {
14387 SDValue N020 = N02.getOperand(0);
14388 if (isContractableAndReassociableFMUL(N020) &&
14389 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14390 N020.getValueType())) {
14391 return DAG.getNode(
14392 PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1),
14393 DAG.getNode(
14394 PreferredFusedOpcode, SL, VT,
14395 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)),
14396 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)),
14397 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14398 }
14399 }
14400 }
14402 // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
14403 // -> (fma (fpext x), (fpext y),
14404 // (fma (fpext u), (fpext v), (fneg z)))
14405 // FIXME: This turns two single-precision and one double-precision
14406 // operation into two double-precision operations, which might not be
14407 // interesting for all targets, especially GPUs.
14408 if (N0.getOpcode() == ISD::FP_EXTEND) {
14409 SDValue N00 = N0.getOperand(0);
14410 if (isFusedOp(N00)) {
14411 SDValue N002 = N00.getOperand(2);
14412 if (isContractableAndReassociableFMUL(N002) &&
14413 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14414 N00.getValueType())) {
14415 return DAG.getNode(
14416 PreferredFusedOpcode, SL, VT,
14417 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14418 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14420 PreferredFusedOpcode, SL, VT,
14421 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)),
14422 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)),
14423 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14424 }
14425 }
14426 }
14428 // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
14429 // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
14430 if (isFusedOp(N1) && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
14431 N1->hasOneUse()) {
14432 SDValue N120 = N1.getOperand(2).getOperand(0);
14433 if (isContractableAndReassociableFMUL(N120) &&
14434 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14435 N120.getValueType())) {
14436 SDValue N1200 = N120.getOperand(0);
14437 SDValue N1201 = N120.getOperand(1);
14438 return DAG.getNode(
14439 PreferredFusedOpcode, SL, VT,
14440 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1),
14441 DAG.getNode(PreferredFusedOpcode, SL, VT,
14442 DAG.getNode(ISD::FNEG, SL, VT,
14443 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)),
14444 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0));
14445 }
14446 }
14448 // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
14449 // -> (fma (fneg (fpext y)), (fpext z),
14450 // (fma (fneg (fpext u)), (fpext v), x))
14451 // FIXME: This turns two single-precision and one double-precision
14452 // operation into two double-precision operations, which might not be
14453 // interesting for all targets, especially GPUs.
14454 if (N1.getOpcode() == ISD::FP_EXTEND && isFusedOp(N1.getOperand(0))) {
14455 SDValue CvtSrc = N1.getOperand(0);
14456 SDValue N100 = CvtSrc.getOperand(0);
14457 SDValue N101 = CvtSrc.getOperand(1);
14458 SDValue N102 = CvtSrc.getOperand(2);
14459 if (isContractableAndReassociableFMUL(N102) &&
14460 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14461 CvtSrc.getValueType())) {
14462 SDValue N1020 = N102.getOperand(0);
14463 SDValue N1021 = N102.getOperand(1);
14464 return DAG.getNode(
14465 PreferredFusedOpcode, SL, VT,
14466 DAG.getNode(ISD::FNEG, SL, VT,
14467 DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)),
14468 DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
14469 DAG.getNode(PreferredFusedOpcode, SL, VT,
14470 DAG.getNode(ISD::FNEG, SL, VT,
14471 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)),
14472 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0));
14473 }
14474 }
14475 }
14477 return SDValue();
14478 }
14480 /// Try to perform FMA combining on a given FMUL node based on the distributive
14481 /// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
14482 /// subtraction instead of addition).
14483 SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
14484 SDValue N0 = N->getOperand(0);
14485 SDValue N1 = N->getOperand(1);
14486 EVT VT = N->getValueType(0);
14487 SDLoc SL(N);
14489 assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
14491 const TargetOptions &Options = DAG.getTarget().Options;
14493 // The transforms below are incorrect when x == 0 and y == inf, because the
14494 // intermediate multiplication produces a nan.
14495 SDValue FAdd = N0.getOpcode() == ISD::FADD ? N0 : N1;
14496 if (!hasNoInfs(Options, FAdd))
14497 return SDValue();
14499 // Floating-point multiply-add without intermediate rounding.
14500 bool HasFMA =
14501 isContractableFMUL(Options, SDValue(N, 0)) &&
14502 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
14503 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
14505 // Floating-point multiply-add with intermediate rounding. This can result
14506 // in a less precise result due to the changed rounding order.
14507 bool HasFMAD = Options.UnsafeFPMath &&
14508 (LegalOperations && TLI.isFMADLegal(DAG, N));
14510 // No valid opcode, do not combine.
14511 if (!HasFMAD && !HasFMA)
14514 // Always prefer FMAD to FMA for precision.
14515 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
14516 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
14518 // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
14519 // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
14520 auto FuseFADD = [&](SDValue X, SDValue Y) {
14521 if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
14522 if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
14523 if (C->isExactlyValue(+1.0))
14524 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14526 if (C->isExactlyValue(-1.0))
14527 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14528 DAG.getNode(ISD::FNEG, SL, VT, Y));
14534 if (SDValue FMA = FuseFADD(N0, N1))
14536 if (SDValue FMA = FuseFADD(N1, N0))
14539 // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
14540 // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
14541 // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
14542 // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
14543 auto FuseFSUB = [&](SDValue X, SDValue Y) {
14544 if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
14545 if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
14546 if (C0->isExactlyValue(+1.0))
14547 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14548 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
14550 if (C0->isExactlyValue(-1.0))
14551 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14552 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
14553 DAG.getNode(ISD::FNEG, SL, VT, Y));
14555 if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
14556 if (C1->isExactlyValue(+1.0))
14557 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14558 DAG.getNode(ISD::FNEG, SL, VT, Y));
14559 if (C1->isExactlyValue(-1.0))
14560 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14567 if (SDValue FMA = FuseFSUB(N0, N1))
14569 if (SDValue FMA = FuseFSUB(N1, N0))
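// Minimal sketch of the distributive folds above, in DAG notation and
// assuming FMA is the preferred fused opcode:
//   fmul (fadd a, 1.0), b  --> fma a, b, b         ; (a+1)*b == a*b + b
//   fmul (fsub 1.0, a), b  --> fma (fneg a), b, b  ; (1-a)*b == b - a*b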
SDValue DAGCombiner::visitFADD(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
  bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  SDNodeFlags Flags = N->getFlags();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (fadd c1, c2) -> c1 + c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FADD, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (N0CFP && !N1CFP)
    return DAG.getNode(ISD::FADD, DL, VT, N1, N0);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
  ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
  if (N1C && N1C->isZero())
    if (N1C->isNegative() || Options.NoSignedZerosFPMath ||
        Flags.hasNoSignedZeros())
      return N0;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // fold (fadd A, (fneg B)) -> (fsub A, B)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
    if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
            N1, DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1);

  // fold (fadd (fneg A), B) -> (fsub B, A)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
    if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
            N0, DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0);

  auto isFMulNegTwo = [](SDValue FMul) {
    if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
      return false;
    auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
    return C && C->isExactlyValue(-2.0);
  };

  // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N0)) {
    SDValue B = N0.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
    return DAG.getNode(ISD::FSUB, DL, VT, N1, Add);
  }
  // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
  if (isFMulNegTwo(N1)) {
    SDValue B = N1.getOperand(0);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
    return DAG.getNode(ISD::FSUB, DL, VT, N0, Add);
  }

  // No FP constant should be created after legalization as Instruction
  // Selection pass has a hard time dealing with FP constants.
  bool AllowNewConst = (Level < AfterLegalizeDAG);

  // If nnan is enabled, fold lots of things.
  if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
    // If allowed, fold (fadd (fneg x), x) -> 0.0
    if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
      return DAG.getConstantFP(0.0, DL, VT);

    // If allowed, fold (fadd x, (fneg x)) -> 0.0
    if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
      return DAG.getConstantFP(0.0, DL, VT);
  }

  // If 'unsafe math' or reassoc and nsz, fold lots of things.
  // TODO: break out portions of the transformations below for which Unsafe is
  //       considered and which do not require both nsz and reassoc
  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      AllowNewConst) {
    // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
    if (N1CFP && N0.getOpcode() == ISD::FADD &&
        DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1);
      return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC);
    }

    // We can fold chains of FADD's of the same value into multiplications.
    // This transform is not safe in general because we are reducing the number
    // of rounding steps.
    if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
      if (N0.getOpcode() == ISD::FMUL) {
        bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));

        // (fadd (fmul x, c), x) -> (fmul x, c+1)
        if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT));
          return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP);
        }

        // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
        if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
            N1.getOperand(0) == N1.getOperand(1) &&
            N0.getOperand(0) == N1.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT));
          return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP);
        }
      }

      if (N1.getOpcode() == ISD::FMUL) {
        bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));

        // (fadd x, (fmul x, c)) -> (fmul x, c+1)
        if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(1.0, DL, VT));
          return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP);
        }

        // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
        if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
            N0.getOperand(0) == N0.getOperand(1) &&
            N1.getOperand(0) == N0.getOperand(0)) {
          SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
                                       DAG.getConstantFP(2.0, DL, VT));
          return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP);
        }
      }

      if (N0.getOpcode() == ISD::FADD) {
        bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
        // (fadd (fadd x, x), x) -> (fmul x, 3.0)
        if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
            (N0.getOperand(0) == N1)) {
          return DAG.getNode(ISD::FMUL, DL, VT, N1,
                             DAG.getConstantFP(3.0, DL, VT));
        }
      }

      if (N1.getOpcode() == ISD::FADD) {
        bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
        // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
        if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
            N1.getOperand(0) == N0) {
          return DAG.getNode(ISD::FMUL, DL, VT, N0,
                             DAG.getConstantFP(3.0, DL, VT));
        }
      }

      // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
      if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
          N0.getOperand(0) == N0.getOperand(1) &&
          N1.getOperand(0) == N1.getOperand(1) &&
          N0.getOperand(0) == N1.getOperand(0)) {
        return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
                           DAG.getConstantFP(4.0, DL, VT));
      }
    }
  } // enable-unsafe-fp-math

  // FADD -> FMA combines:
  if (SDValue Fused = visitFADDForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}
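// Worked example of the reassociation folds above (they require reassoc+nsz
// flags or the equivalent global options, and only run before legalization):
//   fadd (fmul x, 3.0), x          --> fmul x, 4.0
//   fadd (fadd x, x), (fadd x, x)  --> fmul x, 4.0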
SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue N0 = N->getOperand(1);
  SDValue N1 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT ChainVT = N->getValueType(1);
  SDLoc DL(N);
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
    if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
            N1, DAG, LegalOperations, ForCodeSize)) {
      return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
                         {Chain, N0, NegN1});
    }

  // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A)
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
    if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
            N0, DAG, LegalOperations, ForCodeSize)) {
      return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
                         {Chain, N1, NegN0});
    }

  return SDValue();
}
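// Note that only the negation folds are attempted for the strict variant:
// the chain result must be preserved, which is why the replacement is built
// with getVTList(VT, ChainVT) instead of a plain binary node.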
SDValue DAGCombiner::visitFSUB(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (fsub c1, c2) -> c1-c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FSUB, DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  // (fsub A, 0) -> A
  if (N1CFP && N1CFP->isZero()) {
    if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
        Flags.hasNoSignedZeros()) {
      return N0;
    }
  }

  if (N0 == N1) {
    // (fsub x, x) -> 0.0
    if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
      return DAG.getConstantFP(0.0f, DL, VT);
  }

  // (fsub -0.0, N1) -> -N1
  if (N0CFP && N0CFP->isZero()) {
    if (N0CFP->isNegative() ||
        (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
      // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
      // flushed to zero, unless all users treat denorms as zero (DAZ).
      // FIXME: This transform will change the sign of a NaN and the behavior
      // of a signaling NaN. It is only valid when a NoNaN flag is present.
      DenormalMode DenormMode = DAG.getDenormalMode(VT);
      if (DenormMode == DenormalMode::getIEEE()) {
        if (SDValue NegN1 =
                TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
          return NegN1;
        if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
          return DAG.getNode(ISD::FNEG, DL, VT, N1);
      }
    }
  }

  if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
      N1.getOpcode() == ISD::FADD) {
    // X - (X + Y) -> -Y
    if (N0 == N1->getOperand(0))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1));
    // X - (Y + X) -> -Y
    if (N0 == N1->getOperand(1))
      return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0));
  }

  // fold (fsub A, (fneg B)) -> (fadd A, B)
  if (SDValue NegN1 =
          TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
    return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1);

  // FSUB -> FMA combines:
  if (SDValue Fused = visitFSUBForFMACombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}
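// Example of the folds above with nsz+reassoc (DAG notation):
//   fsub x, (fadd x, y)  --> fneg y
//   fsub a, (fneg b)     --> fadd a, b   ; when the fneg can be stripped cheaply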
SDValue DAGCombiner::visitFMUL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  const SDNodeFlags Flags = N->getFlags();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (fmul c1, c2) -> c1*c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMUL, DL, VT, {N0, N1}))
    return C;

  // canonicalize constant to RHS
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
      !DAG.isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMUL, DL, VT, N1, N0);

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
    // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
    if (DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
        N0.getOpcode() == ISD::FMUL) {
      SDValue N00 = N0.getOperand(0);
      SDValue N01 = N0.getOperand(1);
      // Avoid an infinite loop by making sure that N00 is not a constant
      // (the inner multiply has not been constant folded yet).
      if (DAG.isConstantFPBuildVectorOrConstantFP(N01) &&
          !DAG.isConstantFPBuildVectorOrConstantFP(N00)) {
        SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1);
        return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts);
      }
    }

    // Match a special-case: we convert X * 2.0 into fadd.
    // fmul (fadd X, X), C -> fmul X, 2.0 * C
    if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
        N0.getOperand(0) == N0.getOperand(1)) {
      const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
      SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
      return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
    }
  }

  // fold (fmul X, 2.0) -> (fadd X, X)
  if (N1CFP && N1CFP->isExactlyValue(+2.0))
    return DAG.getNode(ISD::FADD, DL, VT, N0, N0);

  // fold (fmul X, -1.0) -> (fsub -0.0, X)
  if (N1CFP && N1CFP->isExactlyValue(-1.0)) {
    if (!LegalOperations || TLI.isOperationLegal(ISD::FSUB, VT)) {
      return DAG.getNode(ISD::FSUB, DL, VT,
                         DAG.getConstantFP(-0.0, DL, VT), N0, Flags);
    }
  }

  // -N0 * -N1 --> N0 * N1
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1);

  // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
  // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
  if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
      (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
      TLI.isOperationLegal(ISD::FABS, VT)) {
    SDValue Select = N0, X = N1;
    if (Select.getOpcode() != ISD::SELECT)
      std::swap(Select, X);

    SDValue Cond = Select.getOperand(0);
    auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
    auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));

    if (TrueOpnd && FalseOpnd &&
        Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
        isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
        cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
      switch (CC) {
      default: break;
      case ISD::SETOLT:
      case ISD::SETULT:
      case ISD::SETOLE:
      case ISD::SETULE:
      case ISD::SETLT:
      case ISD::SETLE:
        std::swap(TrueOpnd, FalseOpnd);
        LLVM_FALLTHROUGH;
      case ISD::SETOGT:
      case ISD::SETUGT:
      case ISD::SETOGE:
      case ISD::SETUGE:
      case ISD::SETGT:
      case ISD::SETGE:
        if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
            TLI.isOperationLegal(ISD::FNEG, VT))
          return DAG.getNode(ISD::FNEG, DL, VT,
                             DAG.getNode(ISD::FABS, DL, VT, X));
        if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
          return DAG.getNode(ISD::FABS, DL, VT, X);

        break;
      }
    }
  }

  // FMUL -> FMA combines:
  if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
    AddToWorklist(Fused.getNode());
    return Fused;
  }

  return SDValue();
}
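// Illustrative instance of the select fold above (requires nnan, nsz, and a
// legal FABS):
//   fmul X, (select (setcc X, 0.0, ogt), -1.0, 1.0)  --> fneg (fabs X)
// since flipping the sign of only the positive range yields exactly -|X|.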
SDValue DAGCombiner::visitFMA(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  // FMA nodes have flags that propagate to the created nodes.
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  bool CanReassociate =
      Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();

  // Constant fold FMA.
  if (isa<ConstantFPSDNode>(N0) &&
      isa<ConstantFPSDNode>(N1) &&
      isa<ConstantFPSDNode>(N2)) {
    return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
  }

  // (-N0 * -N1) + N2 --> (N0 * N1) + N2
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);

  // FIXME: use fast math flags instead of Options.UnsafeFPMath
  if (Options.UnsafeFPMath) {
    if (N0CFP && N0CFP->isZero())
      return N2;
    if (N1CFP && N1CFP->isZero())
      return N2;
  }

  if (N0CFP && N0CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
  if (N1CFP && N1CFP->isExactlyValue(1.0))
    return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);

  // Canonicalize (fma c, x, y) -> (fma x, c, y)
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
      !DAG.isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);

  if (CanReassociate) {
    // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
    if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
        DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
        DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
      return DAG.getNode(ISD::FMUL, DL, VT, N0,
                         DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1)));
    }

    // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
    if (N0.getOpcode() == ISD::FMUL &&
        DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
        DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                         DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)),
                         N2);
    }
  }

  // (fma x, -1, y) -> (fadd (fneg x), y)
  if (N1CFP) {
    if (N1CFP->isExactlyValue(1.0))
      return DAG.getNode(ISD::FADD, DL, VT, N0, N2);

    if (N1CFP->isExactlyValue(-1.0) &&
        (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
      SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
      AddToWorklist(RHSNeg.getNode());
      return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
    }

    // fma (fneg x), K, y -> fma x, -K, y
    if (N0.getOpcode() == ISD::FNEG &&
        (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
         (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
                                              ForCodeSize)))) {
      return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
                         DAG.getNode(ISD::FNEG, DL, VT, N1), N2);
    }
  }

  if (CanReassociate) {
    // (fma x, c, x) -> (fmul x, (c+1))
    if (N1CFP && N0 == N2) {
      return DAG.getNode(
          ISD::FMUL, DL, VT, N0,
          DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT)));
    }

    // (fma x, c, (fneg x)) -> (fmul x, (c-1))
    if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
      return DAG.getNode(
          ISD::FMUL, DL, VT, N0,
          DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT)));
    }
  }

  // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
  // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
  if (!TLI.isFNegFree(VT))
    if (SDValue Neg = TLI.getCheaperNegatedExpression(
            SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
      return DAG.getNode(ISD::FNEG, DL, VT, Neg);

  return SDValue();
}
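// Sketch of the reassociation folds above, with c1 = 2.0 and c2 = 3.0:
//   fma x, 2.0, (fmul x, 3.0)  --> fmul x, 5.0   ; x*2 + x*3 == x*5
//   fma (fmul x, 2.0), 3.0, y  --> fma x, 6.0, y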
// Combine multiple FDIVs with the same divisor into multiple FMULs by the
// reciprocal.
// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
// Notice that this is not always beneficial. One reason is different targets
// may have different costs for FDIV and FMUL, so sometimes the cost of two
// FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
// is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
  // TODO: Limit this transform based on optsize/minsize - it always creates at
  //       least 1 extra instruction. But the perf win may be substantial enough
  //       that only minsize should restrict this.
  bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
  const SDNodeFlags Flags = N->getFlags();
  if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
    return SDValue();

  // Skip if current node is a reciprocal/fneg-reciprocal.
  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
  if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
    return SDValue();

  // Exit early if the target does not want this transform or if there can't
  // possibly be enough uses of the divisor to make the transform worthwhile.
  unsigned MinUses = TLI.combineRepeatedFPDivisors();

  // For splat vectors, scale the number of uses by the splat factor. If we can
  // convert the division into a scalar op, that will likely be much faster.
  unsigned NumElts = 1;
  EVT VT = N->getValueType(0);
  if (VT.isVector() && DAG.isSplatValue(N1))
    NumElts = VT.getVectorMinNumElements();

  if (!MinUses || (N1->use_size() * NumElts) < MinUses)
    return SDValue();

  // Find all FDIV users of the same divisor.
  // Use a set because duplicates may be present in the user list.
  SetVector<SDNode *> Users;
  for (auto *U : N1->uses()) {
    if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
      // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
      if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
          U->getOperand(0) == U->getOperand(1).getOperand(0) &&
          U->getFlags().hasAllowReassociation() &&
          U->getFlags().hasNoSignedZeros())
        continue;

      // This division is eligible for optimization only if global unsafe math
      // is enabled or if this division allows reciprocal formation.
      if (UnsafeMath || U->getFlags().hasAllowReciprocal())
        Users.insert(U);
    }
  }

  // Now that we have the actual number of divisor uses, make sure it meets
  // the minimum threshold specified by the target.
  if ((Users.size() * NumElts) < MinUses)
    return SDValue();

  SDLoc DL(N);
  SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
  SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);

  // Dividend / Divisor -> Dividend * Reciprocal
  for (auto *U : Users) {
    SDValue Dividend = U->getOperand(0);
    if (Dividend != FPOne) {
      SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
                                    Reciprocal, Flags);
      CombineTo(U, NewNode);
    } else if (U != Reciprocal.getNode()) {
      // In the absence of fast-math-flags, this user node is always the
      // same node as Reciprocal, but with FMF they may be different nodes.
      CombineTo(U, Reciprocal);
    }
  }
  return SDValue(N, 0); // N was replaced.
}
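// Worked example, assuming the target's combineRepeatedFPDivisors() returns
// 2 (a hypothetical threshold) and both divisions permit reciprocals:
//   t0 = fdiv a, d
//   t1 = fdiv b, d
// is rewritten as:
//   r  = fdiv 1.0, d
//   t0 = fmul a, r
//   t1 = fmul b, r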
SDValue DAGCombiner::visitFDIV(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);
  const TargetOptions &Options = DAG.getTarget().Options;
  SDNodeFlags Flags = N->getFlags();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (fdiv c1, c2) -> c1/c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FDIV, DL, VT, {N0, N1}))
    return C;

  // fold vector ops
  if (VT.isVector())
    if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
      return FoldedVOp;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  if (SDValue V = combineRepeatedFPDivisors(N))
    return V;

  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
    // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
    if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
      // Compute the reciprocal 1.0 / c2.
      const APFloat &N1APF = N1CFP->getValueAPF();
      APFloat Recip(N1APF.getSemantics(), 1); // 1.0
      APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
      // Only do the transform if the reciprocal is a legal fp immediate that
      // isn't too nasty (eg NaN, denormal, ...).
      if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
          (!LegalOperations ||
           // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
           // backend)... we should handle this gracefully after Legalize.
           // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
           TLI.isOperationLegal(ISD::ConstantFP, VT) ||
           TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
        return DAG.getNode(ISD::FMUL, DL, VT, N0,
                           DAG.getConstantFP(Recip, DL, VT));
    }

    // If this FDIV is part of a reciprocal square root, it may be folded
    // into a target-specific square root estimate instruction.
    if (N1.getOpcode() == ISD::FSQRT) {
      if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
    } else if (N1.getOpcode() == ISD::FP_EXTEND &&
               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
      if (SDValue RV =
              buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
        AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
      }
    } else if (N1.getOpcode() == ISD::FP_ROUND &&
               N1.getOperand(0).getOpcode() == ISD::FSQRT) {
      if (SDValue RV =
              buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
        AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
      }
    } else if (N1.getOpcode() == ISD::FMUL) {
      // Look through an FMUL. Even though this won't remove the FDIV directly,
      // it's still worthwhile to get rid of the FSQRT if possible.
      SDValue Sqrt, Y;
      if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
        Sqrt = N1.getOperand(0);
        Y = N1.getOperand(1);
      } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
        Sqrt = N1.getOperand(1);
        Y = N1.getOperand(0);
      }
      if (Sqrt.getNode()) {
        // If the other multiply operand is known positive, pull it into the
        // sqrt. That will eliminate the division if we convert to an estimate.
        if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
            N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse()) {
          SDValue A;
          if (Y.getOpcode() == ISD::FABS && Y.hasOneUse())
            A = Y.getOperand(0);
          else if (Y == Sqrt.getOperand(0))
            A = Y;
          if (A) {
            // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
            // X / (A * sqrt(A)) --> X / sqrt(A*A*A) --> X * rsqrt(A*A*A)
            SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A);
            SDValue AAZ =
                DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0));
            if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
              return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt);

            // Estimate creation failed. Clean up speculatively created nodes.
            recursivelyDeleteUnusedNodes(AAZ.getNode());
          }
        }

        // We found a FSQRT, so try to make this fold:
        // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
        if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
          SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y);
          AddToWorklist(Div.getNode());
          return DAG.getNode(ISD::FMUL, DL, VT, N0, Div);
        }
      }
    }

    // Fold into a reciprocal estimate and multiply instead of a real divide.
    if (Options.NoInfsFPMath || Flags.hasNoInfs())
      if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
        return RV;
  }

  // Fold X/Sqrt(X) -> Sqrt(X)
  if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
      (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
    if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
      return N1;

  // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
  TargetLowering::NegatibleCost CostN0 =
      TargetLowering::NegatibleCost::Expensive;
  TargetLowering::NegatibleCost CostN1 =
      TargetLowering::NegatibleCost::Expensive;
  SDValue NegN0 =
      TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
  SDValue NegN1 =
      TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
  if (NegN0 && NegN1 &&
      (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
       CostN1 == TargetLowering::NegatibleCost::Cheaper))
    return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1);

  return SDValue();
}
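// Example of the rsqrt fold above under allow-reciprocal (DAG notation):
//   fdiv x, (fmul y, (fsqrt z))  --> fmul x, (fdiv rsqrt_estimate(z), y)
// The remaining inner fdiv by y is itself a candidate for later combines.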
SDValue DAGCombiner::visitFREM(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDNodeFlags Flags = N->getFlags();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
    return R;

  // fold (frem c1, c2) -> fmod(c1,c2)
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1}))
    return C;

  if (SDValue NewSel = foldBinOpIntoSelect(N))
    return NewSel;

  return SDValue();
}
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
  SDNodeFlags Flags = N->getFlags();
  const TargetOptions &Options = DAG.getTarget().Options;

  // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
  // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
  if (!Flags.hasApproximateFuncs() ||
      (!Options.NoInfsFPMath && !Flags.hasNoInfs()))
    return SDValue();

  SDValue N0 = N->getOperand(0);
  if (TLI.isFsqrtCheap(N0, DAG))
    return SDValue();

  // FSQRT nodes have flags that propagate to the created nodes.
  // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
  //       transform the fdiv, we may produce a sub-optimal estimate sequence
  //       because the reciprocal calculation may not have to filter out a
  //       0.0 input.
  return buildSqrtEstimate(N0, Flags);
}
/// copysign(x, fp_extend(y)) -> copysign(x, y)
/// copysign(x, fp_round(y)) -> copysign(x, y)
static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
  SDValue N1 = N->getOperand(1);
  if ((N1.getOpcode() == ISD::FP_EXTEND ||
       N1.getOpcode() == ISD::FP_ROUND)) {
    EVT N1VT = N1->getValueType(0);
    EVT N1Op0VT = N1->getOperand(0).getValueType();

    // Always fold no-op FP casts.
    if (N1VT == N1Op0VT)
      return true;

    // Do not optimize out type conversion of f128 type yet.
    // For some targets like x86_64, configuration is changed to keep one f128
    // value in one SSE register, but instruction selection cannot handle
    // FCOPYSIGN on SSE registers yet.
    if (N1Op0VT == MVT::f128)
      return false;

    // Avoid mismatched vector operand types, for better instruction selection.
    if (N1Op0VT.isVector())
      return false;

    return true;
  }
  return false;
}
SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // fold (fcopysign c1, c2) -> fcopysign(c1,c2)
  if (SDValue C =
          DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1}))
    return C;

  if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
    const APFloat &V = N1C->getValueAPF();
    // copysign(x, c1) -> fabs(x)       iff ispos(c1)
    // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
    if (!V.isNegative()) {
      if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
        return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
    } else {
      if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
        return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
                           DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
    }
  }

  // copysign(fabs(x), y) -> copysign(x, y)
  // copysign(fneg(x), y) -> copysign(x, y)
  // copysign(copysign(x,z), y) -> copysign(x, y)
  if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
      N0.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);

  // copysign(x, abs(y)) -> abs(x)
  if (N1.getOpcode() == ISD::FABS)
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

  // copysign(x, copysign(y,z)) -> copysign(x, z)
  if (N1.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));

  // copysign(x, fp_extend(y)) -> copysign(x, y)
  // copysign(x, fp_round(y)) -> copysign(x, y)
  if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));

  return SDValue();
}
SDValue DAGCombiner::visitFPOW(SDNode *N) {
  ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
  if (!ExponentC)
    return SDValue();
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  // Try to convert x ** (1/3) into cube root.
  // TODO: Handle the various flavors of long double.
  // TODO: Since we're approximating, we don't need an exact 1/3 exponent.
  //       Some range near 1/3 should be fine.
  EVT VT = N->getValueType(0);
  if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
      (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
    // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
    // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
    // pow(-val, 1/3) =  nan; cbrt(-val) = -cbrt(val).
    // For regular numbers, rounding may cause the results to differ.
    // Therefore, we require { nsz ninf nnan afn } for this transform.
    // TODO: We could select out the special cases if we don't have nsz/ninf.
    SDNodeFlags Flags = N->getFlags();
    if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
        !Flags.hasApproximateFuncs())
      return SDValue();

    // Do not create a cbrt() libcall if the target does not have it, and do not
    // turn a pow that has lowering support into a cbrt() libcall.
    if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
        (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
         DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
      return SDValue();

    return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0));
  }

  // Try to convert x ** (1/4) and x ** (3/4) into square roots.
  // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
  // TODO: This could be extended (using a target hook) to handle smaller
  //       power-of-2 fractional exponents.
  bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
  bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
  if (ExponentIs025 || ExponentIs075) {
    // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
    // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) =  NaN.
    // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
    // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) =  NaN.
    // For regular numbers, rounding may cause the results to differ.
    // Therefore, we require { nsz ninf afn } for this transform.
    // TODO: We could select out the special cases if we don't have nsz/ninf.
    SDNodeFlags Flags = N->getFlags();

    // We only need no signed zeros for the 0.25 case.
    if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
        !Flags.hasApproximateFuncs())
      return SDValue();

    // Don't double the number of libcalls. We are trying to inline fast code.
    if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
      return SDValue();

    // Assume that libcalls are the smallest code.
    // TODO: This restriction should probably be lifted for vectors.
    if (ForCodeSize)
      return SDValue();

    // pow(X, 0.25) --> sqrt(sqrt(X))
    SDLoc DL(N);
    SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0));
    SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt);
    if (ExponentIs025)
      return SqrtSqrt;
    // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
    return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt);
  }

  return SDValue();
}
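// Concrete instances of the fractional-power folds above (afn+ninf, plus nsz
// for the 0.25 case):
//   pow(X, 0.25)  -->  sqrt(sqrt(X))
//   pow(X, 0.75)  -->  sqrt(X) * sqrt(sqrt(X))
// Two sqrts and at most one multiply are generally much cheaper than a pow
// libcall.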
static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
                               const TargetLowering &TLI) {
  // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
  // replacing casts with a libcall. We also must be allowed to ignore -0.0
  // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
  // conversions would return +0.0.
  // FIXME: We should be able to use node-level FMF here.
  // TODO: If strict math, should we use FABS (+ range check for signed cast)?
  EVT VT = N->getValueType(0);
  if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
      !DAG.getTarget().Options.NoSignedZerosFPMath)
    return SDValue();

  // fptosi/fptoui round towards zero, so converting from FP to integer and
  // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
  SDValue N0 = N->getOperand(0);
  if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
      N0.getOperand(0).getValueType() == VT)
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));

  if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
      N0.getOperand(0).getValueType() == VT)
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));

  return SDValue();
}
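// Example, assuming NoSignedZerosFPMath and a legal FTRUNC for f64:
//   sitofp (fptosi f64 %x to i32) to f64  -->  ftrunc f64 %x
// Both sides round toward zero; only the sign of a zero result can differ.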
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT OpVT = N0.getValueType();

  // [us]itofp(undef) = 0, because the result value is bounded.
  if (N0.isUndef())
    return DAG.getConstantFP(0.0, SDLoc(N), VT);

  // fold (sint_to_fp c1) -> c1fp
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      // ...but only if the target supports immediate floating-point values
      (!LegalOperations ||
       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);

  // If the input is a legal type, and SINT_TO_FP is not legal on this target,
  // but UINT_TO_FP is legal on this target, try to convert.
  if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
      hasOperation(ISD::UINT_TO_FP, OpVT)) {
    // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
  }

  // The next optimizations are desirable only if SELECT_CC can be lowered.
  // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
  if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
      !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  // fold (sint_to_fp (zext (setcc x, y, cc))) ->
  //      (select (setcc x, y, cc), 1.0, 0.0)
  if (N0.getOpcode() == ISD::ZERO_EXTEND &&
      N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0.getOperand(0),
                         DAG.getConstantFP(1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
    return FTrunc;

  return SDValue();
}
SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT OpVT = N0.getValueType();

  // [us]itofp(undef) = 0, because the result value is bounded.
  if (N0.isUndef())
    return DAG.getConstantFP(0.0, SDLoc(N), VT);

  // fold (uint_to_fp c1) -> c1fp
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
      // ...but only if the target supports immediate floating-point values
      (!LegalOperations ||
       TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
    return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);

  // If the input is a legal type, and UINT_TO_FP is not legal on this target,
  // but SINT_TO_FP is legal on this target, try to convert.
  if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
      hasOperation(ISD::SINT_TO_FP, OpVT)) {
    // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
    if (DAG.SignBitIsZero(N0))
      return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
  }

  // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
  if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
      (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
    SDLoc DL(N);
    return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
                         DAG.getConstantFP(0.0, DL, VT));
  }

  if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
    return FTrunc;

  return SDValue();
}
// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
    return SDValue();

  SDValue Src = N0.getOperand(0);
  EVT SrcVT = Src.getValueType();
  bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
  bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;

  // We can safely assume the conversion won't overflow the output range,
  // because (for example) (uint8_t)18293.f is undefined behavior.
  //
  // Since we can assume the conversion won't overflow, our decision as to
  // whether the input will fit in the float should depend on the minimum
  // of the input range and output range.
  //
  // This means this is also safe for a signed input and unsigned output, since
  // a negative input would lead to undefined behavior.
  unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
  unsigned OutputSize = (int)VT.getScalarSizeInBits();
  unsigned ActualSize = std::min(InputSize, OutputSize);
  const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());

  // We can only fold away the float conversion if the input range can be
  // represented exactly in the float range.
  if (APFloat::semanticsPrecision(sem) >= ActualSize) {
    if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
      unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
                                                       : ISD::ZERO_EXTEND;
      return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
    }
    if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
    return DAG.getBitcast(VT, Src);
  }

  return SDValue();
}
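// Worked instance of the precision check above: for
//   fp_to_sint (sint_to_fp i16 %x to f32) to i32
// InputSize is 15 significant bits, ActualSize = min(15, 32) = 15, and f32
// carries 24 bits of precision, so the round trip is exact and the whole
// expression folds to a sign extension of %x.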
SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fp_to_sint undef) -> undef
  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // fold (fp_to_sint c1fp) -> c1
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);

  return FoldIntToFPToInt(N, DAG);
}

SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fp_to_uint undef) -> undef
  if (N0.isUndef())
    return DAG.getUNDEF(VT);

  // fold (fp_to_uint c1fp) -> c1
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);

  return FoldIntToFPToInt(N, DAG);
}
SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
  EVT VT = N->getValueType(0);

  // fold (fp_round c1fp) -> c1fp
  if (N0CFP)
    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);

  // fold (fp_round (fp_extend x)) -> x
  if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
    return N0.getOperand(0);

  // fold (fp_round (fp_round x)) -> (fp_round x)
  if (N0.getOpcode() == ISD::FP_ROUND) {
    const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
    const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;

    // Skip this folding if it results in an fp_round from f80 to f16.
    //
    // f80 to f16 always generates an expensive (and as yet, unimplemented)
    // libcall to __truncxfhf2 instead of selecting native f16 conversion
    // instructions from f32 or f64. Moreover, the first (value-preserving)
    // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
    // x86.
    if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
      return SDValue();

    // If the first fp_round isn't a value preserving truncation, it might
    // introduce a tie in the second fp_round, that wouldn't occur in the
    // single-step fp_round we want to fold to.
    // In other words, double rounding isn't the same as rounding.
    // Also, this is a value preserving truncation iff both fp_round's are.
    if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
      SDLoc DL(N);
      return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
                         DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
    }
  }

  // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
  if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) {
    SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
                              N0.getOperand(0), N1);
    AddToWorklist(Tmp.getNode());
    return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
                       Tmp, N0.getOperand(1));
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() &&
      N->use_begin()->getOpcode() == ISD::FP_ROUND)
    return SDValue();

  // fold (fp_extend c1fp) -> c1fp
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);

  // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
  if (N0.getOpcode() == ISD::FP16_TO_FP &&
      TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
    return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));

  // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
  // value of X.
  if (N0.getOpcode() == ISD::FP_ROUND
      && N0.getConstantOperandVal(1) == 1) {
    SDValue In = N0.getOperand(0);
    if (In.getValueType() == VT) return In;
    if (VT.bitsLT(In.getValueType()))
      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
                         In, N0.getOperand(1));
    return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
  }

  // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
      TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                     LN0->getChain(),
                                     LN0->getBasePtr(), N0.getValueType(),
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(),
              DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
                          N0.getValueType(), ExtLoad,
                          DAG.getIntPtrConstant(1, SDLoc(N0))),
              ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!
  }

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
    return NewVSel;

  return SDValue();
}
SDValue DAGCombiner::visitFCEIL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fceil c1) -> fceil(c1)
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);

  return SDValue();
}

SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ftrunc c1) -> ftrunc(c1)
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);

  // fold ftrunc (known rounded int x) -> x
  // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
  // likely to be generated to extract integer from a rounded floating value.
  switch (N0.getOpcode()) {
  default: break;
  case ISD::FRINT:
  case ISD::FTRUNC:
  case ISD::FNEARBYINT:
  case ISD::FFLOOR:
  case ISD::FCEIL:
    return N0;
  }

  return SDValue();
}
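// For example, ftrunc (ffloor x) --> ffloor x: the operand is already an
// integral value, so truncating it toward zero is a no-op.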
SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (ffloor c1) -> ffloor(c1)
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);

  return SDValue();
}
SDValue DAGCombiner::visitFNEG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  // Constant fold FNEG.
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);

  if (SDValue NegN0 =
          TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
    return NegN0;

  // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0.
  // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
  // know it was called from a context with a nsz flag if the input fsub does
  // not.
  if (N0.getOpcode() == ISD::FSUB &&
      (DAG.getTarget().Options.NoSignedZerosFPMath ||
       N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
                       N0.getOperand(0));
  }

  if (SDValue Cast = foldSignChangeInBitcast(N))
    return Cast;

  return SDValue();
}
SDValue DAGCombiner::visitFMinMax(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  const SDNodeFlags Flags = N->getFlags();
  unsigned Opc = N->getOpcode();
  bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
  bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM;
  SelectionDAG::FlagInserter FlagsInserter(DAG, N);

  // Constant fold.
  if (SDValue C = DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, {N0, N1}))
    return C;

  // Canonicalize to constant on RHS.
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
      !DAG.isConstantFPBuildVectorOrConstantFP(N1))
    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);

  if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) {
    const APFloat &AF = N1CFP->getValueAPF();

    // minnum(X, nan) -> X
    // maxnum(X, nan) -> X
    // minimum(X, nan) -> nan
    // maximum(X, nan) -> nan
    if (AF.isNaN())
      return PropagatesNaN ? N->getOperand(1) : N->getOperand(0);

    // In the following folds, inf can be replaced with the largest finite
    // float, if the ninf flag is set.
    if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) {
      // minnum(X, -inf) -> -inf
      // maxnum(X, +inf) -> +inf
      // minimum(X, -inf) -> -inf if nnan
      // maximum(X, +inf) -> +inf if nnan
      if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs()))
        return N->getOperand(1);

      // minnum(X, +inf) -> X if nnan
      // maxnum(X, -inf) -> X if nnan
      // minimum(X, +inf) -> X
      // maximum(X, -inf) -> X
      if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs()))
        return N->getOperand(0);
    }
  }

  return SDValue();
}
SDValue DAGCombiner::visitFABS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // fold (fabs c1) -> fabs(c1)
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);

  // fold (fabs (fabs x)) -> (fabs x)
  if (N0.getOpcode() == ISD::FABS)
    return N->getOperand(0);

  // fold (fabs (fneg x)) -> (fabs x)
  // fold (fabs (fcopysign x, y)) -> (fabs x)
  if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
    return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));

  if (SDValue Cast = foldSignChangeInBitcast(N))
    return Cast;

  return SDValue();
}
SDValue DAGCombiner::visitBRCOND(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);

  // BRCOND(FREEZE(cond)) is equivalent to BRCOND(cond) (both are
  // nondeterministic jumps).
  if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse()) {
    return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain,
                       N1->getOperand(0), N2);
  }

  // If N is a constant we could fold this into a fallthrough or unconditional
  // branch. However that doesn't happen very often in normal code, because
  // Instcombine/SimplifyCFG should have handled the available opportunities.
  // If we did this folding here, it would be necessary to update the
  // MachineBasicBlock CFG, which is awkward.

  // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
  // on this target.
  if (N1.getOpcode() == ISD::SETCC &&
      TLI.isOperationLegalOrCustom(ISD::BR_CC,
                                   N1.getOperand(0).getValueType())) {
    return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
                       Chain, N1.getOperand(2),
                       N1.getOperand(0), N1.getOperand(1), N2);
  }

  if (N1.hasOneUse()) {
    // rebuildSetCC calls visitXor which may change the Chain when there is a
    // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
    HandleSDNode ChainHandle(Chain);
    if (SDValue NewN1 = rebuildSetCC(N1))
      return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
                         ChainHandle.getValue(), NewN1, N2);
  }

  return SDValue();
}
16029 SDValue DAGCombiner::rebuildSetCC(SDValue N) {
16030 if (N.getOpcode() == ISD::SRL ||
16031 (N.getOpcode() == ISD::TRUNCATE &&
16032 (N.getOperand(0).hasOneUse() &&
16033 N.getOperand(0).getOpcode() == ISD::SRL))) {
16034 // Look pass the truncate.
16035 if (N.getOpcode() == ISD::TRUNCATE)
16036 N = N.getOperand(0);
16038 // Match this pattern so that we can generate simpler code:
16041 // %b = and i32 %a, 2
16042 // %c = srl i32 %b, 1
16043 // brcond i32 %c ...
16048 // %b = and i32 %a, 2
16049 // %c = setcc eq %b, 0
16052 // This applies only when the AND constant value has one bit set and the
16053 // SRL constant is equal to the log2 of the AND constant. The back-end is
16054 // smart enough to convert the result into a TEST/JMP sequence.
16055 SDValue Op0 = N.getOperand(0);
16056 SDValue Op1 = N.getOperand(1);
16058 if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
16059 SDValue AndOp1 = Op0.getOperand(1);
16061 if (AndOp1.getOpcode() == ISD::Constant) {
16062 const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
16064 if (AndConst.isPowerOf2() &&
16065 cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
16067 return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
16068 Op0, DAG.getConstant(0, DL, Op0.getValueType()),
16075 // Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne))
16076 // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))
16077 if (N.getOpcode() == ISD::XOR) {
16078 // Because we may call this on a speculatively constructed
16079 // SimplifiedSetCC Node, we need to simplify this node first.
16080 // Ideally this should be folded into SimplifySetCC and not
16081 // here. For now, grab a handle to N so we don't lose it from
16082 // replacements interal to the visit.
16083 HandleSDNode XORHandle(N);
16084 while (N.getOpcode() == ISD::XOR) {
16085 SDValue Tmp = visitXOR(N.getNode());
16086 // No simplification done.
16087 if (!Tmp.getNode())
16089 // Returning N is form in-visit replacement that may invalidated
16090 // N. Grab value from Handle.
16091 if (Tmp.getNode() == N.getNode())
16092 N = XORHandle.getValue();
16093 else // Node simplified. Try simplifying again.
16097 if (N.getOpcode() != ISD::XOR)
16100 SDValue Op0 = N->getOperand(0);
16101 SDValue Op1 = N->getOperand(1);
16103 if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
16104 bool Equal = false;
16105 // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
16106 if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
16107 Op0.getValueType() == MVT::i1) {
16108 N = Op0;
16109 Op0 = N->getOperand(0);
16110 Op1 = N->getOperand(1);
16111 Equal = true;
16112 }
16114 EVT SetCCVT = N.getValueType();
16115 if (LegalTypes)
16116 SetCCVT = getSetCCResultType(SetCCVT);
16117 // Replace the uses of XOR with SETCC
16118 return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
16119 Equal ? ISD::SETEQ : ISD::SETNE);
16120 }
16121 }
16123 return SDValue();
16124 }
16126 // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
16128 SDValue DAGCombiner::visitBR_CC(SDNode *N) {
16129 CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
16130 SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
16132 // If N is a constant we could fold this into a fallthrough or unconditional
16133 // branch. However that doesn't happen very often in normal code, because
16134 // Instcombine/SimplifyCFG should have handled the available opportunities.
16135 // If we did this folding here, it would be necessary to update the
16136 // MachineBasicBlock CFG, which is awkward.
16138 // Use SimplifySetCC to simplify SETCC's.
16139 SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
16140 CondLHS, CondRHS, CC->get(), SDLoc(N),
16141 false);
16142 if (Simp.getNode()) AddToWorklist(Simp.getNode());
16144 // fold to a simpler setcc
16145 if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
16146 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
16147 N->getOperand(0), Simp.getOperand(2),
16148 Simp.getOperand(0), Simp.getOperand(1),
16149 N->getOperand(4));
16151 return SDValue();
16152 }
16154 static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
16155 bool &IsLoad, bool &IsMasked, SDValue &Ptr,
16156 const TargetLowering &TLI) {
16157 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
16158 if (LD->isIndexed())
16159 return false;
16160 EVT VT = LD->getMemoryVT();
16161 if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
16162 return false;
16163 Ptr = LD->getBasePtr();
16164 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
16165 if (ST->isIndexed())
16166 return false;
16167 EVT VT = ST->getMemoryVT();
16168 if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
16169 return false;
16170 Ptr = ST->getBasePtr();
16171 IsLoad = false;
16172 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
16173 if (LD->isIndexed())
16174 return false;
16175 EVT VT = LD->getMemoryVT();
16176 if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
16177 !TLI.isIndexedMaskedLoadLegal(Dec, VT))
16178 return false;
16179 Ptr = LD->getBasePtr();
16180 IsMasked = true;
16181 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
16182 if (ST->isIndexed())
16183 return false;
16184 EVT VT = ST->getMemoryVT();
16185 if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
16186 !TLI.isIndexedMaskedStoreLegal(Dec, VT))
16187 return false;
16188 Ptr = ST->getBasePtr();
16189 IsLoad = false;
16190 IsMasked = true;
16191 } else {
16192 return false;
16193 }
16195 return true;
16196 }
16197 /// Try turning a load/store into a pre-indexed load/store when the base
16198 /// pointer is an add or subtract and it has other uses besides the load/store.
16199 /// After the transformation, the new indexed load/store has effectively folded
16200 /// the add/subtract in and all of its other uses are redirected to the
16201 /// new load/store.
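/// For example (an illustration only; actual forms are target-specific):
/// t = add base, 8
/// v = load t -> v, writeback = pre_inc_load base, 8
/// with the remaining uses of t redirected to the writeback value.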
16202 bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
16203 if (Level < AfterLegalizeDAG)
16204 return false;
16206 bool IsLoad = true;
16207 bool IsMasked = false;
16208 SDValue Ptr;
16209 if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
16210 Ptr, TLI))
16211 return false;
16213 // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
16214 // out. There is no reason to make this a preinc/predec.
16215 if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
16216 Ptr.getNode()->hasOneUse())
16217 return false;
16219 // Ask the target to do addressing mode selection.
16220 SDValue BasePtr;
16221 SDValue Offset;
16222 ISD::MemIndexedMode AM = ISD::UNINDEXED;
16223 if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
16224 return false;
16226 // Backends without true r+i pre-indexed forms may need to pass a
16227 // constant base with a variable offset so that constant coercion
16228 // will work with the patterns in canonical form.
16229 bool Swapped = false;
16230 if (isa<ConstantSDNode>(BasePtr)) {
16231 std::swap(BasePtr, Offset);
16232 Swapped = true;
16233 }
16235 // Don't create an indexed load / store with a zero offset.
16236 if (isNullConstant(Offset))
16237 return false;
16239 // Try turning it into a pre-indexed load / store except when:
16240 // 1) The new base ptr is a frame index.
16241 // 2) If N is a store and the new base ptr is either the same as or is a
16242 // predecessor of the value being stored.
16243 // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
16244 // that would create a cycle.
16245 // 4) All uses are load / store ops that use it as old base ptr.
16247 // Check #1. Preinc'ing a frame index would require copying the stack pointer
16248 // (plus the implicit offset) to a register to preinc anyway.
16249 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
16250 return false;
16252 // Check #2.
16253 if (!IsLoad) {
16254 SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
16255 : cast<StoreSDNode>(N)->getValue();
16257 // Would require a copy.
16258 if (Val == BasePtr)
16259 return false;
16261 // Would create a cycle.
16262 if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
16263 return false;
16264 }
16266 // Caches for hasPredecessorHelper.
16267 SmallPtrSet<const SDNode *, 32> Visited;
16268 SmallVector<const SDNode *, 16> Worklist;
16269 Worklist.push_back(N);
16271 // If the offset is a constant, there may be other adds of constants that
16272 // can be folded with this one. We should do this to avoid having to keep
16273 // a copy of the original base pointer.
16274 SmallVector<SDNode *, 16> OtherUses;
16275 if (isa<ConstantSDNode>(Offset))
16276 for (SDNode::use_iterator UI = BasePtr->use_begin(),
16277 UE = BasePtr->use_end();
16278 UI != UE; ++UI) {
16279 SDUse &Use = UI.getUse();
16280 // Skip the use that is Ptr and uses of other results from BasePtr's
16281 // node (important for nodes that return multiple results).
16282 if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
16283 continue;
16285 if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
16286 continue;
16288 if (Use.getUser()->getOpcode() != ISD::ADD &&
16289 Use.getUser()->getOpcode() != ISD::SUB) {
16290 OtherUses.clear();
16291 break;
16292 }
16294 SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
16295 if (!isa<ConstantSDNode>(Op1)) {
16296 OtherUses.clear();
16297 break;
16298 }
16300 // FIXME: In some cases, we can be smarter about this.
16301 if (Op1.getValueType() != Offset.getValueType()) {
16302 OtherUses.clear();
16303 break;
16304 }
16306 OtherUses.push_back(Use.getUser());
16307 }
16309 if (Swapped)
16310 std::swap(BasePtr, Offset);
16312 // Now check for #3 and #4.
16313 bool RealUse = false;
16315 for (SDNode *Use : Ptr->uses()) {
16316 if (Use == N)
16317 continue;
16318 if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
16319 return false;
16321 // If Ptr may be folded in addressing mode of other use, then it's
16322 // not profitable to do this transformation.
16323 if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
16324 RealUse = true;
16325 }
16327 if (!RealUse)
16328 return false;
16330 SDValue Result;
16331 if (!IsMasked) {
16332 if (IsLoad)
16333 Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
16334 else
16335 Result =
16336 DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
16337 } else {
16338 if (IsLoad)
16339 Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
16340 Offset, AM);
16341 else
16342 Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
16343 Offset, AM);
16344 }
16346 ++PreIndexedNodes;
16347 LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
16348 Result.dump(&DAG); dbgs() << '\n');
16349 WorklistRemover DeadNodes(*this);
16350 if (IsLoad) {
16351 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
16352 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
16353 } else {
16354 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
16355 }
16357 // Finally, since the node is now dead, remove it from the graph.
16358 deleteAndRecombine(N);
16360 if (Swapped)
16361 std::swap(BasePtr, Offset);
16363 // Replace other uses of BasePtr that can be updated to use Ptr
16364 for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
16365 unsigned OffsetIdx = 1;
16366 if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
16367 OffsetIdx = 0;
16368 assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
16369 BasePtr.getNode() && "Expected BasePtr operand");
16371 // We need to replace ptr0 in the following expression:
16372 // x0 * offset0 + y0 * ptr0 = t0
16373 // knowing that
16374 // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
16376 // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
16377 // indexed load/store and the expression that needs to be re-written.
16379 // Therefore, we have:
16380 // t0 = (x0 * offset0 - x1 * y0 * y1 * offset1) + (y0 * y1) * t1
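// For instance (illustrative numbers): if t0 = ptr0 + 16 and the new
// indexed access computes t1 = ptr0 + 4 (so x0 = x1 = y0 = y1 = 1), the
// formula above rewrites t0 as t1 + 12.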
16382 auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
16383 const APInt &Offset0 = CN->getAPIntValue();
16384 const APInt &Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
16385 int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
16386 int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
16387 int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
16388 int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
16390 unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
16392 APInt CNV = Offset0;
16393 if (X0 < 0) CNV = -CNV;
16394 if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
16395 else CNV = CNV - Offset1;
16397 SDLoc DL(OtherUses[i]);
16399 // We can now generate the new expression.
16400 SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
16401 SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
16403 SDValue NewUse = DAG.getNode(Opcode,
16404 DL,
16405 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
16406 DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
16407 deleteAndRecombine(OtherUses[i]);
16410 // Replace the uses of Ptr with uses of the updated base value.
16411 DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
16412 deleteAndRecombine(Ptr.getNode());
16413 AddToWorklist(Result.getNode());
16415 return true;
16416 }
16418 static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
16419 SDValue &BasePtr, SDValue &Offset,
16420 ISD::MemIndexedMode &AM,
16422 const TargetLowering &TLI) {
16423 if (PtrUse == N ||
16424 (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
16425 return false;
16427 if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
16428 return false;
16430 // Don't create an indexed load / store with a zero offset.
16431 if (isNullConstant(Offset))
16432 return false;
16434 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
16435 return false;
16437 SmallPtrSet<const SDNode *, 32> Visited;
16438 for (SDNode *Use : BasePtr->uses()) {
16439 if (Use == Ptr.getNode())
16440 continue;
16442 // Don't combine if there's a later user which could perform the index instead.
16443 if (isa<MemSDNode>(Use)) {
16444 bool IsLoad = true;
16445 bool IsMasked = false;
16446 SDValue OtherPtr;
16447 if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
16448 IsMasked, OtherPtr, TLI)) {
16449 SmallVector<const SDNode *, 2> Worklist;
16450 Worklist.push_back(Use);
16451 if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
16452 return false;
16453 }
16454 }
16456 // If all the uses are load / store addresses, then don't do the
16457 // transformation.
16458 if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
16459 for (SDNode *UseUse : Use->uses())
16460 if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
16461 return false;
16462 }
16463 }
16465 return true;
16466 }
16467 static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
16468 bool &IsMasked, SDValue &Ptr,
16469 SDValue &BasePtr, SDValue &Offset,
16470 ISD::MemIndexedMode &AM,
16472 const TargetLowering &TLI) {
16473 if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
16474 IsMasked, Ptr, TLI) ||
16475 Ptr->hasOneUse())
16476 return nullptr;
16478 // Try turning it into a post-indexed load / store except when
16479 // 1) All uses are load / store ops that use it as base ptr (and
16480 // it may be folded as addressing mmode).
16481 // 2) Op must be independent of N, i.e. Op is neither a predecessor
16482 // nor a successor of N. Otherwise, if Op is folded that would
16483 // create a cycle.
16484 for (SDNode *Op : Ptr->uses()) {
16485 // Check for #1.
16486 if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
16487 continue;
16489 // Check for #2.
16490 SmallPtrSet<const SDNode *, 32> Visited;
16491 SmallVector<const SDNode *, 8> Worklist;
16492 // Ptr is predecessor to both N and Op.
16493 Visited.insert(Ptr.getNode());
16494 Worklist.push_back(N);
16495 Worklist.push_back(Op);
16496 if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
16497 !SDNode::hasPredecessorHelper(Op, Visited, Worklist))
16498 return Op;
16499 }
16501 return nullptr;
16502 }
16503 /// Try to combine a load/store with an add/sub of the base pointer node into
16504 /// a post-indexed load/store. The transformation effectively folds the
16505 /// add/subtract into the new indexed load/store, and all other uses of the
16506 /// add/subtract are redirected to the new load/store.
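/// For example (an illustration only; actual forms are target-specific):
/// v = load base
/// t = add base, 4 -> v, writeback = post_inc_load base, 4
/// with the uses of t redirected to the writeback value.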
16507 bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
16508 if (Level < AfterLegalizeDAG)
16509 return false;
16511 bool IsLoad = true;
16512 bool IsMasked = false;
16513 SDValue Ptr;
16514 SDValue BasePtr;
16515 SDValue Offset;
16516 ISD::MemIndexedMode AM = ISD::UNINDEXED;
16517 SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
16518 Offset, AM, DAG, TLI);
16519 if (!Op)
16520 return false;
16522 SDValue Result;
16523 if (!IsMasked)
16524 Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
16526 : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
16527 BasePtr, Offset, AM);
16528 else
16529 Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
16530 BasePtr, Offset, AM)
16531 : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
16532 BasePtr, Offset, AM);
16533 ++PostIndexedNodes;
16535 LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: ";
16536 Result.dump(&DAG); dbgs() << '\n');
16537 WorklistRemover DeadNodes(*this);
16538 if (IsLoad) {
16539 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
16540 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
16541 } else {
16542 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
16543 }
16545 // Finally, since the node is now dead, remove it from the graph.
16546 deleteAndRecombine(N);
16548 // Replace the uses of Use with uses of the updated base value.
16549 DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
16550 Result.getValue(IsLoad ? 1 : 0));
16551 deleteAndRecombine(Op);
16553 return true;
16554 }
16555 /// Return the base-pointer arithmetic from an indexed \p LD.
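/// e.g. for a PRE_INC load from (base, off) this returns (add base, off), so
/// the pointer arithmetic survives when the indexed load itself goes away.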
16556 SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
16557 ISD::MemIndexedMode AM = LD->getAddressingMode();
16558 assert(AM != ISD::UNINDEXED);
16559 SDValue BP = LD->getOperand(1);
16560 SDValue Inc = LD->getOperand(2);
16562 // Some backends use TargetConstants for load offsets, but don't expect
16563 // TargetConstants in general ADD nodes. We can convert these constants into
16564 // regular Constants (if the constant is not opaque).
16565 assert((Inc.getOpcode() != ISD::TargetConstant ||
16566 !cast<ConstantSDNode>(Inc)->isOpaque()) &&
16567 "Cannot split out indexing using opaque target constants");
16568 if (Inc.getOpcode() == ISD::TargetConstant) {
16569 ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
16570 Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
16571 ConstInc->getValueType(0));
16572 }
16574 unsigned Opc =
16575 (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
16576 return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
16577 }
16579 static inline ElementCount numVectorEltsOrZero(EVT T) {
16580 return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
16581 }
16583 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
16584 Val = ST->getValue();
16585 EVT STType = Val.getValueType();
16586 EVT STMemType = ST->getMemoryVT();
16587 if (STType == STMemType)
16588 return true;
16589 if (isTypeLegal(STMemType))
16590 return false; // fail.
16591 if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
16592 TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
16593 Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
16594 return true;
16595 }
16596 if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
16597 STType.isInteger() && STMemType.isInteger()) {
16598 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
16599 return true;
16600 }
16601 if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
16602 Val = DAG.getBitcast(STMemType, Val);
16603 return true;
16604 }
16605 return false; // fail.
16606 }
16608 bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
16609 EVT LDMemType = LD->getMemoryVT();
16610 EVT LDType = LD->getValueType(0);
16611 assert(Val.getValueType() == LDMemType &&
16612 "Attempting to extend value of non-matching type");
16613 if (LDType == LDMemType)
16614 return true;
16615 if (LDMemType.isInteger() && LDType.isInteger()) {
16616 switch (LD->getExtensionType()) {
16617 case ISD::NON_EXTLOAD:
16618 Val = DAG.getBitcast(LDType, Val);
16619 return true;
16620 case ISD::EXTLOAD:
16621 Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
16622 return true;
16623 case ISD::SEXTLOAD:
16624 Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
16625 return true;
16626 case ISD::ZEXTLOAD:
16627 Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
16628 return true;
16629 }
16630 }
16631 return false;
16632 }
16634 SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
16635 if (OptLevel == CodeGenOpt::None || !LD->isSimple())
16636 return SDValue();
16637 SDValue Chain = LD->getOperand(0);
16638 StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
16639 // TODO: Relax this restriction for unordered atomics (see D66309)
16640 if (!ST || !ST->isSimple())
16641 return SDValue();
16643 EVT LDType = LD->getValueType(0);
16644 EVT LDMemType = LD->getMemoryVT();
16645 EVT STMemType = ST->getMemoryVT();
16646 EVT STType = ST->getValue().getValueType();
16648 // There are two cases to consider here:
16649 // 1. The store is fixed width and the load is scalable. In this case we
16650 // don't know at compile time if the store completely envelops the load
16651 // so we abandon the optimisation.
16652 // 2. The store is scalable and the load is fixed width. We could
16653 // potentially support a limited number of cases here, but there has been
16654 // no cost-benefit analysis to prove it's worth it.
16655 bool LdStScalable = LDMemType.isScalableVector();
16656 if (LdStScalable != STMemType.isScalableVector())
16657 return SDValue();
16659 // If we are dealing with scalable vectors on a big endian platform the
16660 // calculation of offsets below becomes trickier, since we do not know at
16661 // compile time the absolute size of the vector. Until we've done more
16662 // analysis on big-endian platforms it seems better to bail out for now.
16663 if (LdStScalable && DAG.getDataLayout().isBigEndian())
16664 return SDValue();
16666 BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
16667 BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
16668 int64_t Offset;
16669 if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
16670 return SDValue();
16672 // Normalize for endianness. After this, Offset=0 will denote that the least
16673 // significant bit in the loaded value maps to the least significant bit in
16674 // the stored value. With Offset=n (for n > 0) the loaded value starts at the
16675 // n-th least significant byte of the stored value.
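// For example (illustrative): forwarding a 1-byte load from the raw offset 0
// of a 4-byte store on a big-endian target yields Offset = (32 - 8) / 8 - 0
// = 3, i.e. the load reads the 3rd least significant byte of the stored
// value.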
16676 if (DAG.getDataLayout().isBigEndian())
16677 Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
16678 (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
16679 8 -
16680 Offset;
16682 // Check that the stored value covers all bits that are loaded.
16683 bool STCoversLD;
16685 TypeSize LdMemSize = LDMemType.getSizeInBits();
16686 TypeSize StMemSize = STMemType.getSizeInBits();
16687 if (LdStScalable)
16688 STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
16689 else
16690 STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
16691 StMemSize.getFixedSize());
16693 auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
16694 if (LD->isIndexed()) {
16695 // Cannot handle opaque target constants and we must respect the user's
16696 // request not to split indexes from loads.
16697 if (!canSplitIdx(LD))
16698 return SDValue();
16699 SDValue Idx = SplitIndexingFromLoad(LD);
16700 SDValue Ops[] = {Val, Idx, Chain};
16701 return CombineTo(LD, Ops, 3);
16703 return CombineTo(LD, Val, Chain);
16704 };
16706 if (!STCoversLD)
16707 return SDValue();
16709 // Memory as copy space (potentially masked).
16710 if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
16711 // Simple case: Direct non-truncating forwarding
16712 if (LDType.getSizeInBits() == LdMemSize)
16713 return ReplaceLd(LD, ST->getValue(), Chain);
16714 // Can we model the truncate and extension with an and mask?
16715 if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
16716 !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
16717 // Mask to size of LDMemType
16718 SDValue Mask =
16719 DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
16720 StMemSize.getFixedSize()),
16721 SDLoc(ST), STType);
16722 auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
16723 return ReplaceLd(LD, Val, Chain);
16724 }
16725 }
16727 // TODO: Deal with nonzero offset.
16728 if (LD->getBasePtr().isUndef() || Offset != 0)
16729 return SDValue();
16730 // Model necessary truncations / extensions.
16731 SDValue Val;
16732 // Truncate Value to the stored memory size.
16734 if (!getTruncatedStoreValue(ST, Val))
16735 return SDValue();
16736 if (!isTypeLegal(LDMemType))
16737 return SDValue();
16738 if (STMemType != LDMemType) {
16739 // TODO: Support vectors? This requires extract_subvector/bitcast.
16740 if (!STMemType.isVector() && !LDMemType.isVector() &&
16741 STMemType.isInteger() && LDMemType.isInteger())
16742 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
16743 else
16744 return SDValue();
16745 }
16746 if (!extendLoadedValueToExtension(LD, Val))
16747 return SDValue();
16748 return ReplaceLd(LD, Val, Chain);
16751 // On failure, cleanup dead nodes we may have created.
16752 if (Val->use_empty())
16753 deleteAndRecombine(Val.getNode());
16755 return SDValue();
16756 }
16757 SDValue DAGCombiner::visitLOAD(SDNode *N) {
16758 LoadSDNode *LD = cast<LoadSDNode>(N);
16759 SDValue Chain = LD->getChain();
16760 SDValue Ptr = LD->getBasePtr();
16762 // If load is not volatile and there are no uses of the loaded value (and
16763 // the updated indexed value in case of indexed loads), change uses of the
16764 // chain value into uses of the chain input (i.e. delete the dead load).
16765 // TODO: Allow this for unordered atomics (see D66309)
16766 if (LD->isSimple()) {
16767 if (N->getValueType(1) == MVT::Other) {
16768 // Unindexed loads.
16769 if (!N->hasAnyUseOfValue(0)) {
16770 // It's not safe to use the two value CombineTo variant here. e.g.
16771 // v1, chain2 = load chain1, loc
16772 // v2, chain3 = load chain2, loc
16774 // Now we replace use of chain2 with chain1. This makes the second load
16775 // isomorphic to the one we are deleting, and thus makes this load live.
16776 LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
16777 dbgs() << "\nWith chain: "; Chain.dump(&DAG);
16779 WorklistRemover DeadNodes(*this);
16780 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
16781 AddUsersToWorklist(Chain.getNode());
16782 if (N->use_empty())
16783 deleteAndRecombine(N);
16785 return SDValue(N, 0); // Return N so it doesn't get rechecked!
16786 }
16787 } else {
16788 // Indexed loads.
16789 assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
16791 // If this load has an opaque TargetConstant offset, then we cannot split
16792 // the indexing into an add/sub directly (that TargetConstant may not be
16793 // valid for a different type of node, and we cannot convert an opaque
16794 // target constant into a regular constant).
16795 bool CanSplitIdx = canSplitIdx(LD);
16797 if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
16798 SDValue Undef = DAG.getUNDEF(N->getValueType(0));
16799 SDValue Index;
16800 if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
16801 Index = SplitIndexingFromLoad(LD);
16802 // Try to fold the base pointer arithmetic into subsequent loads and
16803 // stores.
16804 AddUsersToWorklist(N);
16805 } else
16806 Index = DAG.getUNDEF(N->getValueType(1));
16807 LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
16808 dbgs() << "\nWith: "; Undef.dump(&DAG);
16809 dbgs() << " and 2 other values\n");
16810 WorklistRemover DeadNodes(*this);
16811 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
16812 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
16813 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
16814 deleteAndRecombine(N);
16815 return SDValue(N, 0); // Return N so it doesn't get rechecked!
16816 }
16817 }
16818 }
16820 // If this load is directly stored, replace the load value with the stored
16821 // value.
16822 if (auto V = ForwardStoreValueToDirectLoad(LD))
16823 return V;
16825 // Try to infer better alignment information than the load already has.
16826 if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
16827 if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
16828 if (*Alignment > LD->getAlign() &&
16829 isAligned(*Alignment, LD->getSrcValueOffset())) {
16830 SDValue NewLoad = DAG.getExtLoad(
16831 LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
16832 LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
16833 LD->getMemOperand()->getFlags(), LD->getAAInfo());
16834 // NewLoad will always be N as we are only refining the alignment
16835 assert(NewLoad.getNode() == N);
16836 (void)NewLoad;
16837 }
16838 }
16839 }
16841 if (LD->isUnindexed()) {
16842 // Walk up chain skipping non-aliasing memory nodes.
16843 SDValue BetterChain = FindBetterChain(LD, Chain);
16845 // If there is a better chain.
16846 if (Chain != BetterChain) {
16847 SDValue ReplLoad;
16850 // Replace the chain to avoid dependency.
16850 if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
16851 ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
16852 BetterChain, Ptr, LD->getMemOperand());
16853 } else {
16854 ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
16855 LD->getValueType(0),
16856 BetterChain, Ptr, LD->getMemoryVT(),
16857 LD->getMemOperand());
16858 }
16860 // Create token factor to keep old chain connected.
16861 SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
16862 MVT::Other, Chain, ReplLoad.getValue(1));
16864 // Replace uses with load result and token factor
16865 return CombineTo(N, ReplLoad.getValue(0), Token);
16866 }
16867 }
16869 // Try transforming N to an indexed load.
16870 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
16871 return SDValue(N, 0);
16873 // Try to slice up N to more direct loads if the slices are mapped to
16874 // different register banks or pairing can take place.
16875 if (SliceUpLoad(N))
16876 return SDValue(N, 0);
16878 return SDValue();
16879 }
16881 namespace {
16883 /// Helper structure used to slice a load in smaller loads.
16884 /// Basically a slice is obtained from the following sequence:
16885 /// Origin = load Ty1, Base
16886 /// Shift = srl Ty1 Origin, CstTy Amount
16887 /// Inst = trunc Shift to Ty2
16889 /// Then, it will be rewritten into:
16890 /// Slice = load SliceTy, Base + SliceOffset
16891 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
16893 /// SliceTy is deduced from the number of bits that are actually used to
16894 /// build Inst.
16895 struct LoadedSlice {
16896 /// Helper structure used to compute the cost of a slice.
16897 struct Cost {
16898 /// Are we optimizing for code size?
16899 bool ForCodeSize = false;
16902 unsigned Loads = 0;
16903 unsigned Truncates = 0;
16904 unsigned CrossRegisterBanksCopies = 0;
16905 unsigned ZExts = 0;
16906 unsigned Shift = 0;
16908 explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
16910 /// Get the cost of one isolated slice.
16911 Cost(const LoadedSlice &LS, bool ForCodeSize)
16912 : ForCodeSize(ForCodeSize), Loads(1) {
16913 EVT TruncType = LS.Inst->getValueType(0);
16914 EVT LoadedType = LS.getLoadedType();
16915 if (TruncType != LoadedType &&
16916 !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
16917 ZExts = 1;
16918 }
16920 /// Account for slicing gain in the current cost.
16921 /// Slicing provides a few gains, like removing a shift or a
16922 /// truncate. This method allows growing the cost of the original
16923 /// load with the gain from this slice.
16924 void addSliceGain(const LoadedSlice &LS) {
16925 // Each slice saves a truncate.
16926 const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
16927 if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
16928 LS.Inst->getValueType(0)))
16929 ++Truncates;
16930 // If there is a shift amount, this slice gets rid of it.
16931 if (LS.Shift)
16932 ++Shift;
16933 // If this slice can merge a cross register bank copy, account for it.
16934 if (LS.canMergeExpensiveCrossRegisterBankCopy())
16935 ++CrossRegisterBanksCopies;
16936 }
16938 Cost &operator+=(const Cost &RHS) {
16939 Loads += RHS.Loads;
16940 Truncates += RHS.Truncates;
16941 CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
16942 ZExts += RHS.ZExts;
16943 Shift += RHS.Shift;
16944 return *this;
16945 }
16947 bool operator==(const Cost &RHS) const {
16948 return Loads == RHS.Loads && Truncates == RHS.Truncates &&
16949 CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
16950 ZExts == RHS.ZExts && Shift == RHS.Shift;
16951 }
16953 bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
16955 bool operator<(const Cost &RHS) const {
16956 // Assume cross-register-bank copies are as expensive as loads.
16957 // FIXME: Do we want some more target hooks?
16958 unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
16959 unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
16960 // Unless we are optimizing for code size, consider the
16961 // expensive operation first.
16962 if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
16963 return ExpensiveOpsLHS < ExpensiveOpsRHS;
16964 return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
16965 (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
16966 }
16968 bool operator>(const Cost &RHS) const { return RHS < *this; }
16970 bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
16972 bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
16973 };
16975 // The last instruction that represents the slice. This should be a
16976 // truncate instruction.
16977 SDNode *Inst;
16979 // The original load instruction.
16980 LoadSDNode *Origin;
16982 // The right shift amount in bits from the original load.
16983 unsigned Shift;
16985 // The DAG from which Origin comes.
16986 // This is used to get some contextual information about legal types, etc.
16987 SelectionDAG *DAG;
16989 LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
16990 unsigned Shift = 0, SelectionDAG *DAG = nullptr)
16991 : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
16993 /// Get the bits used in a chunk of bits \p BitWidth large.
16994 /// \return Result is \p BitWidth bits wide, with used bits set to 1 and
16995 /// unused bits set to 0.
16996 APInt getUsedBits() const {
16997 // Reproduce the trunc(lshr) sequence:
16998 // - Start from the truncated value.
16999 // - Zero extend to the desired bit width.
17000 // - Shift left.
17001 assert(Origin && "No original load to compare against.");
17002 unsigned BitWidth = Origin->getValueSizeInBits(0);
17003 assert(Inst && "This slice is not bound to an instruction");
17004 assert(Inst->getValueSizeInBits(0) <= BitWidth &&
17005 "Extracted slice is bigger than the whole type!");
17006 APInt UsedBits(Inst->getValueSizeInBits(0), 0);
17007 UsedBits.setAllBits();
17008 UsedBits = UsedBits.zext(BitWidth);
17009 UsedBits <<= Shift;
17010 return UsedBits;
17011 }
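// For example (illustrative): if the slice is (trunc (srl Origin:i64, 32)
// to i32), this returns 0xFFFFFFFF00000000.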
17013 /// Get the size of the slice to be loaded in bytes.
17014 unsigned getLoadedSize() const {
17015 unsigned SliceSize = getUsedBits().countPopulation();
17016 assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
17017 return SliceSize / 8;
17018 }
17020 /// Get the type that will be loaded for this slice.
17021 /// Note: This may not be the final type for the slice.
17022 EVT getLoadedType() const {
17023 assert(DAG && "Missing context");
17024 LLVMContext &Ctxt = *DAG->getContext();
17025 return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
17026 }
17028 /// Get the alignment of the load used for this slice.
17029 Align getAlign() const {
17030 Align Alignment = Origin->getAlign();
17031 uint64_t Offset = getOffsetFromBase();
17032 if (Offset != 0)
17033 Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
17034 return Alignment;
17035 }
17037 /// Check if this slice can be rewritten with legal operations.
17038 bool isLegal() const {
17039 // An invalid slice is not legal.
17040 if (!Origin || !Inst || !DAG)
17041 return false;
17043 // Offsets are for indexed load only, we do not handle that.
17044 if (!Origin->getOffset().isUndef())
17045 return false;
17047 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
17049 // Check that the type is legal.
17050 EVT SliceType = getLoadedType();
17051 if (!TLI.isTypeLegal(SliceType))
17052 return false;
17054 // Check that the load is legal for this type.
17055 if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
17056 return false;
17058 // Check that the offset can be computed.
17059 // 1. Check its type.
17060 EVT PtrType = Origin->getBasePtr().getValueType();
17061 if (PtrType == MVT::Untyped || PtrType.isExtended())
17062 return false;
17064 // 2. Check that it fits in the immediate.
17065 if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
17066 return false;
17068 // 3. Check that the computation is legal.
17069 if (!TLI.isOperationLegal(ISD::ADD, PtrType))
17070 return false;
17072 // Check that the zext is legal if it needs one.
17073 EVT TruncateType = Inst->getValueType(0);
17074 if (TruncateType != SliceType &&
17075 !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
17076 return false;
17078 return true;
17079 }
17081 /// Get the offset in bytes of this slice in the original chunk of
17082 /// bits.
17083 /// \pre DAG != nullptr.
17084 uint64_t getOffsetFromBase() const {
17085 assert(DAG && "Missing context.");
17086 bool IsBigEndian = DAG->getDataLayout().isBigEndian();
17087 assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
17088 uint64_t Offset = Shift / 8;
17089 unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
17090 assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
17091 "The size of the original loaded type is not a multiple of a"
17093 // If Offset is bigger than TySizeInBytes, it means we are loading all
17094 // zeros. This should have been optimized away earlier in the process.
17095 assert(TySizeInBytes > Offset &&
17096 "Invalid shift amount for given loaded size");
17097 if (IsBigEndian)
17098 Offset = TySizeInBytes - Offset - getLoadedSize();
17099 return Offset;
17100 }
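// For example (illustrative): an i64 load with Shift = 16 and a 2-byte slice
// gives Offset = 2 on little-endian and 8 - 2 - 2 = 4 on big-endian targets.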
17102 /// Generate the sequence of instructions to load the slice
17103 /// represented by this object and redirect the uses of this slice to
17104 /// this new sequence of instructions.
17105 /// \pre this->Inst && this->Origin are valid Instructions and this
17106 /// object passed the legal check: LoadedSlice::isLegal returned true.
17107 /// \return The last instruction of the sequence used to load the slice.
17108 SDValue loadSlice() const {
17109 assert(Inst && Origin && "Unable to replace a non-existing slice.");
17110 const SDValue &OldBaseAddr = Origin->getBasePtr();
17111 SDValue BaseAddr = OldBaseAddr;
17112 // Get the offset in that chunk of bytes w.r.t. the endianness.
17113 int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
17114 assert(Offset >= 0 && "Offset too big to fit in int64_t!");
17115 if (Offset) {
17116 // BaseAddr = BaseAddr + Offset.
17117 EVT ArithType = BaseAddr.getValueType();
17118 SDLoc DL(Origin);
17119 BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
17120 DAG->getConstant(Offset, DL, ArithType));
17121 }
17123 // Create the type of the loaded slice according to its size.
17124 EVT SliceType = getLoadedType();
17126 // Create the load for the slice.
17127 SDValue LastInst =
17128 DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
17129 Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
17130 Origin->getMemOperand()->getFlags());
17131 // If the final type is not the same as the loaded type, this means that
17132 // we have to pad with zero. Create a zero extend for that.
17133 EVT FinalType = Inst->getValueType(0);
17134 if (SliceType != FinalType)
17135 LastInst =
17136 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
17137 return LastInst;
17138 }
17140 /// Check if this slice can be merged with an expensive cross register
17141 /// bank copy. E.g.,
17142 /// i = load i32
17143 /// f = bitcast i32 i to float
17144 bool canMergeExpensiveCrossRegisterBankCopy() const {
17145 if (!Inst || !Inst->hasOneUse())
17146 return false;
17147 SDNode *Use = *Inst->use_begin();
17148 if (Use->getOpcode() != ISD::BITCAST)
17149 return false;
17150 assert(DAG && "Missing context");
17151 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
17152 EVT ResVT = Use->getValueType(0);
17153 const TargetRegisterClass *ResRC =
17154 TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
17155 const TargetRegisterClass *ArgRC =
17156 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
17157 Use->getOperand(0)->isDivergent());
17158 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
17159 return false;
17161 // At this point, we know that we perform a cross-register-bank copy.
17162 // Check if it is expensive.
17163 const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
17164 // Assume bitcasts are cheap, unless both register classes do not
17165 // explicitly share a common sub class.
17166 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
17167 return false;
17169 // Check if it will be merged with the load.
17170 // 1. Check the alignment / fast memory access constraint.
17171 bool IsFast = false;
17172 if (!TLI.allowsMemoryAccess(*DAG->getContext(), DAG->getDataLayout(), ResVT,
17173 Origin->getAddressSpace(), getAlign(),
17174 Origin->getMemOperand()->getFlags(), &IsFast) ||
17175 !IsFast)
17176 return false;
17178 // 2. Check that the load is a legal operation for that type.
17179 if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
17180 return false;
17182 // 3. Check that we do not have a zext in the way.
17183 if (Inst->getValueType(0) != getLoadedType())
17184 return false;
17186 return true;
17187 }
17188 };
17190 } // end anonymous namespace
17192 /// Check that all bits set in \p UsedBits form a dense region, i.e.,
17193 /// \p UsedBits looks like 0..0 1..1 0..0.
17194 static bool areUsedBitsDense(const APInt &UsedBits) {
17195 // If all the bits are one, this is dense!
17196 if (UsedBits.isAllOnes())
17197 return true;
17199 // Get rid of the unused bits on the right.
17200 APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
17201 // Get rid of the unused bits on the left.
17202 if (NarrowedUsedBits.countLeadingZeros())
17203 NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
17204 // Check that the chunk of bits is completely used.
17205 return NarrowedUsedBits.isAllOnes();
17206 }
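// For example (illustrative): 0x00FFFF00 is dense, while 0x00FF00FF is not.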
17208 /// Check whether or not \p First and \p Second are next to each other
17209 /// in memory. This means that there is no hole between the bits loaded
17210 /// by \p First and the bits loaded by \p Second.
17211 static bool areSlicesNextToEachOther(const LoadedSlice &First,
17212 const LoadedSlice &Second) {
17213 assert(First.Origin == Second.Origin && First.Origin &&
17214 "Unable to match different memory origins.");
17215 APInt UsedBits = First.getUsedBits();
17216 assert((UsedBits & Second.getUsedBits()) == 0 &&
17217 "Slices are not supposed to overlap.");
17218 UsedBits |= Second.getUsedBits();
17219 return areUsedBitsDense(UsedBits);
17220 }
17222 /// Adjust the \p GlobalLSCost according to the target
17223 /// pairing capabilities and the layout of the slices.
17224 /// \pre \p GlobalLSCost should account for at least as many loads as
17225 /// there is in the slices in \p LoadedSlices.
17226 static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
17227 LoadedSlice::Cost &GlobalLSCost) {
17228 unsigned NumberOfSlices = LoadedSlices.size();
17229 // If there are fewer than 2 elements, no pairing is possible.
17230 if (NumberOfSlices < 2)
17231 return;
17233 // Sort the slices so that elements that are likely to be next to each
17234 // other in memory are next to each other in the list.
17235 llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
17236 assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
17237 return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
17238 });
17239 const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
17240 // First (resp. Second) is the first (resp. second) potential candidate
17241 // to be placed in a paired load.
17242 const LoadedSlice *First = nullptr;
17243 const LoadedSlice *Second = nullptr;
17244 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
17245 // Set the beginning of the pair.
17246 First = Second) {
17247 Second = &LoadedSlices[CurrSlice];
17249 // If First is NULL, it means we start a new pair.
17250 // Get to the next slice.
17251 if (!First)
17252 continue;
17254 EVT LoadedType = First->getLoadedType();
17256 // If the types of the slices are different, we cannot pair them.
17257 if (LoadedType != Second->getLoadedType())
17260 // Check if the target supplies paired loads for this type.
17261 Align RequiredAlignment;
17262 if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
17263 // Move to the next pair; this type is hopeless.
17264 Second = nullptr;
17265 continue;
17266 }
17267 // Check if we meet the alignment requirement.
17268 if (First->getAlign() < RequiredAlignment)
17269 continue;
17271 // Check that both loads are next to each other in memory.
17272 if (!areSlicesNextToEachOther(*First, *Second))
17273 continue;
17275 assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
17276 --GlobalLSCost.Loads;
17277 // Move to the next pair.
17278 Second = nullptr;
17279 }
17280 }
17282 /// Check the profitability of all involved LoadedSlice.
17283 /// Currently, it is considered profitable if there are exactly two
17284 /// involved slices (1) which are (2) next to each other in memory, and
17285 /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
17287 /// Note: The order of the elements in \p LoadedSlices may be modified, but not
17288 /// the elements themselves.
17290 /// FIXME: When the cost model is mature enough, we can relax
17291 /// constraints (1) and (2).
17292 static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
17293 const APInt &UsedBits, bool ForCodeSize) {
17294 unsigned NumberOfSlices = LoadedSlices.size();
17295 if (StressLoadSlicing)
17296 return NumberOfSlices > 1;
17298 // Check (1).
17299 if (NumberOfSlices != 2)
17300 return false;
17302 // Check (2).
17303 if (!areUsedBitsDense(UsedBits))
17304 return false;
17306 // Check (3).
17307 LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
17308 // The original code has one big load.
17309 OrigCost.Loads = 1;
17310 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
17311 const LoadedSlice &LS = LoadedSlices[CurrSlice];
17312 // Accumulate the cost of all the slices.
17313 LoadedSlice::Cost SliceCost(LS, ForCodeSize);
17314 GlobalSlicingCost += SliceCost;
17316 // Account as cost in the original configuration the gain obtained
17317 // with the current slices.
17318 OrigCost.addSliceGain(LS);
17319 }
17321 // If the target supports paired load, adjust the cost accordingly.
17322 adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
17323 return OrigCost > GlobalSlicingCost;
17324 }
17326 /// If the given load, \p LI, is used only by trunc or trunc(lshr)
17327 /// operations, split it into the various pieces being extracted.
17329 /// This sort of thing is introduced by SROA.
17330 /// This slicing takes care not to insert overlapping loads.
17331 /// \pre LI is a simple load (i.e., not an atomic or volatile load).
17332 bool DAGCombiner::SliceUpLoad(SDNode *N) {
17333 if (Level < AfterLegalizeDAG)
17334 return false;
17336 LoadSDNode *LD = cast<LoadSDNode>(N);
17337 if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
17338 !LD->getValueType(0).isInteger())
17339 return false;
17341 // The algorithm to split up a load of a scalable vector into individual
17342 // elements currently requires knowing the length of the loaded type,
17343 // so will need adjusting to work on scalable vectors.
17344 if (LD->getValueType(0).isScalableVector())
17345 return false;
17347 // Keep track of already used bits to detect overlapping values.
17348 // In that case, we will just abort the transformation.
17349 APInt UsedBits(LD->getValueSizeInBits(0), 0);
17351 SmallVector<LoadedSlice, 4> LoadedSlices;
17353 // Check if this load is used as several smaller chunks of bits.
17354 // Basically, look for uses in trunc or trunc(lshr) and record a new chain
17355 // of computation for each trunc.
17356 for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
17357 UI != UIEnd; ++UI) {
17358 // Skip the uses of the chain.
17359 if (UI.getUse().getResNo() != 0)
17360 continue;
17362 SDNode *User = *UI;
17363 unsigned Shift = 0;
17365 // Check if this is a trunc(lshr).
17366 if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
17367 isa<ConstantSDNode>(User->getOperand(1))) {
17368 Shift = User->getConstantOperandVal(1);
17369 User = *User->use_begin();
17370 }
17372 // At this point, User is a truncate iff we encountered trunc or
17373 // trunc(lshr).
17374 if (User->getOpcode() != ISD::TRUNCATE)
17375 return false;
17377 // The width of the type must be a power of 2 and at least 8 bits.
17378 // Otherwise the load cannot be represented in LLVM IR.
17379 // Moreover, if we shifted by an amount that is not a multiple of 8, the
17380 // slice would span several bytes. We do not support that.
17381 unsigned Width = User->getValueSizeInBits(0);
17382 if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
17383 return false;
17385 // Build the slice for this chain of computations.
17386 LoadedSlice LS(User, LD, Shift, &DAG);
17387 APInt CurrentUsedBits = LS.getUsedBits();
17389 // Check if this slice overlaps with another.
17390 if ((CurrentUsedBits & UsedBits) != 0)
17391 return false;
17392 // Update the bits used globally.
17393 UsedBits |= CurrentUsedBits;
17395 // Check if the new slice would be legal.
17396 if (!LS.isLegal())
17397 return false;
17399 // Record the slice.
17400 LoadedSlices.push_back(LS);
17403 // Abort slicing if it does not seem to be profitable.
17404 if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
17405 return false;
17409 // Rewrite each chain to use an independent load.
17410 // By construction, each chain can be represented by a unique load.
17412 // Prepare the argument for the new token factor for all the slices.
17413 SmallVector<SDValue, 8> ArgChains;
17414 for (const LoadedSlice &LS : LoadedSlices) {
17415 SDValue SliceInst = LS.loadSlice();
17416 CombineTo(LS.Inst, SliceInst, true);
17417 if (SliceInst.getOpcode() != ISD::LOAD)
17418 SliceInst = SliceInst.getOperand(0);
17419 assert(SliceInst->getOpcode() == ISD::LOAD &&
17420 "It takes more than a zext to get to the loaded slice!!");
17421 ArgChains.push_back(SliceInst.getValue(1));
17424 SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
17425 ArgChains);
17426 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
17427 AddToWorklist(Chain.getNode());
17428 return true;
17429 }
17431 /// Check to see if V is (and (load ptr), imm), where the load has
17432 /// specific bytes cleared out. If so, return the byte size being masked out
17433 /// and the shift amount.
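/// For example (illustrative): for i32 (and (load p), 0xFFFF00FF) this
/// returns {1, 1}: one byte is masked out, starting at byte offset 1.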
17434 static std::pair<unsigned, unsigned>
17435 CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
17436 std::pair<unsigned, unsigned> Result(0, 0);
17438 // Check for the structure we're looking for.
17439 if (V->getOpcode() != ISD::AND ||
17440 !isa<ConstantSDNode>(V->getOperand(1)) ||
17441 !ISD::isNormalLoad(V->getOperand(0).getNode()))
17442 return Result;
17444 // Check the chain and pointer.
17445 LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
17446 if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
17448 // This only handles simple types.
17449 if (V.getValueType() != MVT::i16 &&
17450 V.getValueType() != MVT::i32 &&
17451 V.getValueType() != MVT::i64)
17452 return Result;
17454 // Check the constant mask. Invert it so that the bits being masked out are
17455 // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
17456 // follow the sign bit for uniformity.
17457 uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
17458 unsigned NotMaskLZ = countLeadingZeros(NotMask);
17459 if (NotMaskLZ & 7) return Result; // Must be multiple of a byte.
17460 unsigned NotMaskTZ = countTrailingZeros(NotMask);
17461 if (NotMaskTZ & 7) return Result; // Must be multiple of a byte.
17462 if (NotMaskLZ == 64) return Result; // All zero mask.
17464 // See if we have a continuous run of bits. If so, the mask is of the form 0*1+0*.
17465 if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
17466 return Result;
17468 // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
17469 if (V.getValueType() != MVT::i64 && NotMaskLZ)
17470 NotMaskLZ -= 64-V.getValueSizeInBits();
17472 unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
17473 switch (MaskedBytes) {
17474 case 1:
17475 case 2:
17476 case 4: break;
17477 default: return Result; // All one mask, or 5-byte mask.
17478 }
17481 // Verify that the masked region starts at a multiple of its width so that
17482 // the access is aligned the same as the access width.
17482 if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
17484 // For narrowing to be valid, it must be the case that the load is the
17485 // memory operation immediately preceding the store.
17486 if (LD == Chain.getNode())
17487 ; // ok.
17488 else if (Chain->getOpcode() == ISD::TokenFactor &&
17489 SDValue(LD, 1).hasOneUse()) {
17490 // LD has only 1 chain use, so there are no indirect dependencies.
17491 if (!LD->isOperandOf(Chain.getNode()))
17492 return Result;
17493 } else
17494 return Result; // Fail.
17496 Result.first = MaskedBytes;
17497 Result.second = NotMaskTZ/8;
17498 return Result;
17499 }
17501 /// Check to see if IVal is something that provides a value as specified by
17502 /// MaskInfo. If so, replace the specified store with a narrower store of
17503 /// truncated IVal.
17504 static SDValue
17505 ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
17506 SDValue IVal, StoreSDNode *St,
17507 DAGCombiner *DC) {
17508 unsigned NumBytes = MaskInfo.first;
17509 unsigned ByteShift = MaskInfo.second;
17510 SelectionDAG &DAG = DC->getDAG();
17512 // Check to see if IVal is all zeros in the part being masked in by the 'or'
17513 // that uses this. If not, this is not a replacement.
17514 APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
17515 ByteShift*8, (ByteShift+NumBytes)*8);
17516 if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();
17518 // Check that it is legal on the target to do this. It is legal if the new
17519 // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
17520 // legalization. If the source type is legal, but the store type isn't, see
17521 // if we can use a truncating store.
17522 MVT VT = MVT::getIntegerVT(NumBytes * 8);
17523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17524 bool UseTruncStore;
17525 if (DC->isTypeLegal(VT))
17526 UseTruncStore = false;
17527 else if (TLI.isTypeLegal(IVal.getValueType()) &&
17528 TLI.isTruncStoreLegal(IVal.getValueType(), VT))
17529 UseTruncStore = true;
17530 else
17531 return SDValue();
17532 // Check that the target doesn't think this is a bad idea.
17533 if (St->getMemOperand() &&
17534 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
17535 *St->getMemOperand()))
17536 return SDValue();
17538 // Okay, we can do this! Replace the 'St' store with a store of IVal that is
17539 // shifted by ByteShift and truncated down to NumBytes.
17540 if (ByteShift) {
17541 SDLoc DL(IVal);
17542 IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
17543 DAG.getConstant(ByteShift*8, DL,
17544 DC->getShiftAmountTy(IVal.getValueType())));
17545 }
17547 // Figure out the offset for the store and the alignment of the access.
17548 unsigned StOffset;
17549 if (DAG.getDataLayout().isLittleEndian())
17550 StOffset = ByteShift;
17551 else
17552 StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
17554 SDValue Ptr = St->getBasePtr();
17555 if (StOffset) {
17556 SDLoc DL(IVal);
17557 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL);
17558 }
17560 ++OpsNarrowed;
17561 if (UseTruncStore)
17562 return DAG.getTruncStore(St->getChain(), SDLoc(St), IVal, Ptr,
17563 St->getPointerInfo().getWithOffset(StOffset),
17564 VT, St->getOriginalAlign());
17566 // Truncate down to the new size.
17567 IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
17569 return DAG
17570 .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
17571 St->getPointerInfo().getWithOffset(StOffset),
17572 St->getOriginalAlign());
17573 }
17575 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
17576 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
17577 /// narrowing the load and store if it would end up being a win for performance
17578 /// or code size.
17579 SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
17580 StoreSDNode *ST = cast<StoreSDNode>(N);
17581 if (!ST->isSimple())
17582 return SDValue();
17584 SDValue Chain = ST->getChain();
17585 SDValue Value = ST->getValue();
17586 SDValue Ptr = ST->getBasePtr();
17587 EVT VT = Value.getValueType();
17589 if (ST->isTruncatingStore() || VT.isVector())
17590 return SDValue();
17592 unsigned Opc = Value.getOpcode();
17594 if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
17595 !Value.hasOneUse())
17596 return SDValue();
17598 // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
17599 // is a byte mask indicating a consecutive number of bytes, check to see if
17600 // Y is known to provide just those bytes. If so, we try to replace the
17601 // load + replace + store sequence with a single (narrower) store, which makes
17602 // the load dead.
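// For example (illustrative, little endian): for i32 (or (and (load p),
// 0xFFFF00FF), Y), where Y is known to have only bits 8..15 set, the
// sequence can be replaced by a single i8 store of (trunc (srl Y, 8)) at
// address p+1.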
17603 if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
17604 std::pair<unsigned, unsigned> MaskedLoad;
17605 MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
17606 if (MaskedLoad.first)
17607 if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
17608 Value.getOperand(1), ST, this))
17609 return NewST;
17611 // Or is commutative, so try swapping X and Y.
17612 MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
17613 if (MaskedLoad.first)
17614 if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
17615 Value.getOperand(0), ST, this))
17616 return NewST;
17617 }
17619 if (!EnableReduceLoadOpStoreWidth)
17622 if (Value.getOperand(1).getOpcode() != ISD::Constant)
17623 return SDValue();
17625 SDValue N0 = Value.getOperand(0);
17626 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
17627 Chain == SDValue(N0.getNode(), 1)) {
17628 LoadSDNode *LD = cast<LoadSDNode>(N0);
17629 if (LD->getBasePtr() != Ptr ||
17630 LD->getPointerInfo().getAddrSpace() !=
17631 ST->getPointerInfo().getAddrSpace())
17632 return SDValue();
17634 // Find the type to narrow the load / op / store to.
17635 SDValue N1 = Value.getOperand(1);
17636 unsigned BitWidth = N1.getValueSizeInBits();
17637 APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
17638 if (Opc == ISD::AND)
17639 Imm ^= APInt::getAllOnes(BitWidth);
17640 if (Imm == 0 || Imm.isAllOnes())
17641 return SDValue();
17642 unsigned ShAmt = Imm.countTrailingZeros();
17643 unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
17644 unsigned NewBW = NextPowerOf2(MSB - ShAmt);
17645 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
17646 // The narrowing should be profitable, the load/store operation should be
17647 // legal (or custom) and the store size should be equal to the NewVT width.
17648 while (NewBW < BitWidth &&
17649 (NewVT.getStoreSizeInBits() != NewBW ||
17650 !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
17651 !TLI.isNarrowingProfitable(VT, NewVT))) {
17652 NewBW = NextPowerOf2(NewBW);
17653 NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
17655 if (NewBW >= BitWidth)
17656 return SDValue();
17658 // If the lsb that changes does not start at a NewBW boundary,
17659 // start at the previous one.
17660 if (ShAmt % NewBW)
17661 ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
17662 APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
17663 std::min(BitWidth, ShAmt + NewBW));
17664 if ((Imm & Mask) == Imm) {
17665 APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
17666 if (Opc == ISD::AND)
17667 NewImm ^= APInt::getAllOnes(NewBW);
17668 uint64_t PtrOff = ShAmt / 8;
17669 // For big endian targets, we need to adjust the offset to the pointer to
17670 // load the correct bytes.
17671 if (DAG.getDataLayout().isBigEndian())
17672 PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
17674 bool IsFast = false;
17675 Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
17676 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
17677 LD->getAddressSpace(), NewAlign,
17678 LD->getMemOperand()->getFlags(), &IsFast) ||
17679 !IsFast)
17680 return SDValue();
17682 SDValue NewPtr =
17683 DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(PtrOff), SDLoc(LD));
17684 SDValue NewLD =
17685 DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
17686 LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
17687 LD->getMemOperand()->getFlags(), LD->getAAInfo());
17688 SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
17689 DAG.getConstant(NewImm, SDLoc(Value),
17690 NewVT));
17691 SDValue NewST =
17692 DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
17693 ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
17695 AddToWorklist(NewPtr.getNode());
17696 AddToWorklist(NewLD.getNode());
17697 AddToWorklist(NewVal.getNode());
17698 WorklistRemover DeadNodes(*this);
17699 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
17700 ++OpsNarrowed;
17701 return NewST;
17702 }
17703 }
17705 return SDValue();
17706 }
17708 /// For a given floating point load / store pair, if the load value isn't used
17709 /// by any other operations, then consider transforming the pair to integer
17710 /// load / store operations if the target deems the transformation profitable.
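/// For example (illustrative): a float load whose only use is a float store
/// can become an i32 load plus an i32 store, avoiding an FP register
/// round-trip.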
17711 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
17712 StoreSDNode *ST = cast<StoreSDNode>(N);
17713 SDValue Value = ST->getValue();
17714 if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
17715 Value.hasOneUse()) {
17716 LoadSDNode *LD = cast<LoadSDNode>(Value);
17717 EVT VT = LD->getMemoryVT();
17718 if (!VT.isFloatingPoint() ||
17719 VT != ST->getMemoryVT() ||
17720 LD->isNonTemporal() ||
17721 ST->isNonTemporal() ||
17722 LD->getPointerInfo().getAddrSpace() != 0 ||
17723 ST->getPointerInfo().getAddrSpace() != 0)
17724 return SDValue();
17726 TypeSize VTSize = VT.getSizeInBits();
17728 // We don't know the size of scalable types at compile time so we cannot
17729 // create an integer of the equivalent size.
17730 if (VTSize.isScalable())
17731 return SDValue();
17733 bool FastLD = false, FastST = false;
17734 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
17735 if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
17736 !TLI.isOperationLegal(ISD::STORE, IntVT) ||
17737 !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
17738 !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT) ||
17739 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
17740 *LD->getMemOperand(), &FastLD) ||
17741 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
17742 *ST->getMemOperand(), &FastST) ||
17743 !FastLD || !FastST)
17744 return SDValue();
17746 SDValue NewLD =
17747 DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
17748 LD->getPointerInfo(), LD->getAlign());
17750 SDValue NewST =
17751 DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
17752 ST->getPointerInfo(), ST->getAlign());
17754 AddToWorklist(NewLD.getNode());
17755 AddToWorklist(NewST.getNode());
17756 WorklistRemover DeadNodes(*this);
17757 DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
17758 ++LdStFP2Int;
17759 return NewST;
17760 }
17762 return SDValue();
17763 }
// This is a helper function for visitMUL to check the profitability
// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
// MulNode is the original multiply, AddNode is (add x, c1),
// and ConstNode is c2.
//
// If the (add x, c1) has multiple uses, we could increase
// the number of adds if we make this transformation.
// It would only be worth doing this if we can remove a
// multiply in the process. Check for that here.
// To illustrate:
//     (A + c1) * c3
//     (A + c2) * c3
//
// We're checking for cases where we have common "c3 * A" expressions.
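//
// Concrete example (illustrative, not from the original source): if the DAG
// also contains (A + 5) * 10, rewriting (A + 3) * 10 as (A * 10) + 30
// exposes a common (A * 10) subexpression once the other multiply is
// rewritten the same way.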
bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
                                              SDValue ConstNode) {
  // If the add only has one use, and the target thinks the folding is
  // profitable or does not lead to worse code, this would be OK to do.
  if (AddNode->hasOneUse() &&
      TLI.isMulAddWithConstProfitable(AddNode, ConstNode))
    return true;

  // Walk all the users of the constant with which we're multiplying.
  for (SDNode *Use : ConstNode->uses()) {
    if (Use == MulNode) // This use is the one we're on right now. Skip it.
      continue;

    if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
      SDNode *OtherOp;
      SDNode *MulVar = AddNode.getOperand(0).getNode();

      // OtherOp is what we're multiplying against the constant.
      if (Use->getOperand(0) == ConstNode)
        OtherOp = Use->getOperand(1).getNode();
      else
        OtherOp = Use->getOperand(0).getNode();

      // Check to see if multiply is with the same operand of our "add".
      //
      //     ConstNode  = CONST
      //     Use = ConstNode * A  <-- visiting Use. OtherOp is A.
      //     ...
      //     AddNode  = (A + c1)  <-- MulVar is A.
      //         = AddNode * ConstNode   <-- current visiting instruction.
      //
      // If we make this transformation, we will have a common
      // multiply (ConstNode * A) that we can save.
      if (OtherOp == MulVar)
        return true;

      // Now check to see if a future expansion will give us a common
      // multiply.
      //
      //     ConstNode  = CONST
      //     AddNode    = (A + c1)
      //     ...   = AddNode * ConstNode <-- current visiting instruction.
      //     ...
      //     OtherOp = (A + c2)
      //     Use     = OtherOp * ConstNode <-- visiting Use.
      //
      // If we make this transformation, we will have a common
      // multiply (CONST * A) after we also do the same transformation
      // to the "Use" instruction.
      if (OtherOp->getOpcode() == ISD::ADD &&
          DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
          OtherOp->getOperand(0).getNode() == MulVar)
        return true;
    }
  }

  // Didn't find a case where this would be profitable.
  return false;
}
SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
                                         unsigned NumStores) {
  SmallVector<SDValue, 8> Chains;
  SmallPtrSet<const SDNode *, 8> Visited;
  SDLoc StoreDL(StoreNodes[0].MemNode);

  for (unsigned i = 0; i < NumStores; ++i) {
    Visited.insert(StoreNodes[i].MemNode);
  }

  // Don't include nodes that are children or repeated nodes.
  for (unsigned i = 0; i < NumStores; ++i) {
    if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
      Chains.push_back(StoreNodes[i].MemNode->getChain());
  }

  assert(Chains.size() > 0 && "Chain should have generated a chain");
  return DAG.getTokenFactor(StoreDL, Chains);
}
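
// Merging example (illustrative, not from the original source): four
// consecutive i8 stores of the constants 0x11, 0x22, 0x33, 0x44 to p, p+1,
// p+2, p+3 can be emitted as a single i32 store of 0x44332211 to p on a
// little-endian target; the integer path below builds exactly this constant.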
bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
    SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
    bool IsConstantSrc, bool UseVector, bool UseTrunc) {
  // Make sure we have something to merge.
  if (NumStores < 2)
    return false;

  assert((!UseTrunc || !UseVector) &&
         "This optimization cannot emit a vector truncating store");

  // The latest Node in the DAG.
  SDLoc DL(StoreNodes[0].MemNode);

  TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
  unsigned SizeInBits = NumStores * ElementSizeBits;
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;

  Optional<MachineMemOperand::Flags> Flags;
  AAMDNodes AAInfo;
  for (unsigned I = 0; I != NumStores; ++I) {
    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
    if (I == 0) {
      Flags = St->getMemOperand()->getFlags();
      AAInfo = St->getAAInfo();
      continue;
    }
    // Skip merging if there's an inconsistent flag.
    if (Flags != St->getMemOperand()->getFlags())
      return false;
    // Concatenate AA metadata.
    AAInfo = AAInfo.concat(St->getAAInfo());
  }

  EVT StoreTy;
  if (UseVector) {
    unsigned Elts = NumStores * NumMemElts;
    // Get the type for the merged vector store.
    StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
  } else {
    StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
  }

  SDValue StoredVal;
  if (UseVector) {
    if (IsConstantSrc) {
      SmallVector<SDValue, 8> BuildVector;
      for (unsigned I = 0; I != NumStores; ++I) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
        SDValue Val = St->getValue();
        // If constant is of the wrong type, convert it now.
        if (MemVT != Val.getValueType()) {
          Val = peekThroughBitcasts(Val);
          // Deal with constants of wrong size.
          if (ElementSizeBits != Val.getValueSizeInBits()) {
            EVT IntMemVT =
                EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
            if (isa<ConstantFPSDNode>(Val)) {
              // Not clear how to truncate FP values.
              return false;
            }

            if (auto *C = dyn_cast<ConstantSDNode>(Val))
              Val = DAG.getConstant(C->getAPIntValue()
                                        .zextOrTrunc(Val.getValueSizeInBits())
                                        .zextOrTrunc(ElementSizeBits),
                                    SDLoc(C), IntMemVT);
          }
          // Bitcast the value to the correctly sized memory type.
          Val = DAG.getBitcast(MemVT, Val);
        }
        BuildVector.push_back(Val);
      }
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, BuildVector);
    } else {
      SmallVector<SDValue, 8> Ops;
      for (unsigned i = 0; i < NumStores; ++i) {
        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
        SDValue Val = peekThroughBitcasts(St->getValue());
        // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
        // type MemVT. If the underlying value is not the correct
        // type, but it is an extraction of an appropriate vector we
        // can recast Val to be of the correct type. This may require
        // converting between EXTRACT_VECTOR_ELT and
        // EXTRACT_SUBVECTOR.
        if ((MemVT != Val.getValueType()) &&
            (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
             Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
          EVT MemVTScalarTy = MemVT.getScalarType();
          // We may need to add a bitcast here to get types to line up.
          if (MemVTScalarTy != Val.getValueType().getScalarType()) {
            Val = DAG.getBitcast(MemVT, Val);
          } else {
            unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
                                            : ISD::EXTRACT_VECTOR_ELT;
            SDValue Vec = Val.getOperand(0);
            SDValue Idx = Val.getOperand(1);
            Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
          }
        }
        Ops.push_back(Val);
      }

      // Build the extracted vector elements back into a vector.
      StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
                                               : ISD::BUILD_VECTOR,
                              DL, StoreTy, Ops);
    }
  } else {
    // We should always use a vector store when merging extracted vector
    // elements, so this path implies a store of constants.
    assert(IsConstantSrc && "Merged vector elements should use vector store");

    APInt StoreInt(SizeInBits, 0);

    // Construct a single integer constant which is made of the smaller
    // constant inputs.
    bool IsLE = DAG.getDataLayout().isLittleEndian();
    for (unsigned i = 0; i < NumStores; ++i) {
      unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);

      SDValue Val = St->getValue();
      Val = peekThroughBitcasts(Val);
      StoreInt <<= ElementSizeBits;
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
        StoreInt |= C->getAPIntValue()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
        StoreInt |= C->getValueAPF()
                        .bitcastToAPInt()
                        .zextOrTrunc(ElementSizeBits)
                        .zextOrTrunc(SizeInBits);
        // If fp truncation is necessary give up for now.
        if (MemVT.getSizeInBits() != ElementSizeBits)
          return false;
      } else {
        llvm_unreachable("Invalid constant element type");
      }
    }

    // Create the new Load and Store operations.
    StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
  }

  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);

  // Make sure we use trunc store if it's necessary to be legal.
  SDValue NewStore;
  if (UseVector || !UseTrunc) {
    NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
                            FirstInChain->getPointerInfo(),
                            FirstInChain->getAlign(), *Flags, AAInfo);
  } else { // Must be realized as a trunc store
    EVT LegalizedStoredValTy =
        TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
    unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
    ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
    SDValue ExtendedStoreVal =
        DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
                        LegalizedStoredValTy);
    NewStore = DAG.getTruncStore(
        NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
        FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
        FirstInChain->getAlign(), *Flags, AAInfo);
  }

  // Replace all merged stores with the new store.
  for (unsigned i = 0; i < NumStores; ++i)
    CombineTo(StoreNodes[i].MemNode, NewStore);

  AddToWorklist(NewChain.getNode());
  return true;
}
void DAGCombiner::getStoreMergeCandidates(
    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
    SDNode *&RootNode) {
  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer. We must have a base and an offset. Do not handle stores to undef
  // base pointers.
  BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
  if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef())
    return;

  SDValue Val = peekThroughBitcasts(St->getValue());
  StoreSource StoreSrc = getStoreSource(Val);
  assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");

  // Match on the load's base pointer if relevant.
  EVT MemVT = St->getMemoryVT();
  BaseIndexOffset LBasePtr;
  EVT LoadVT;
  if (StoreSrc == StoreSource::Load) {
    auto *Ld = cast<LoadSDNode>(Val);
    LBasePtr = BaseIndexOffset::match(Ld, DAG);
    LoadVT = Ld->getMemoryVT();
    // Load and store should be the same type.
    if (MemVT != LoadVT)
      return;
    // Loads must only have one use.
    if (!Ld->hasNUsesOfValue(1, 0))
      return;
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Ld->isSimple() || Ld->isIndexed())
      return;
  }
  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
                            int64_t &Offset) -> bool {
    // The memory operands must not be volatile/indexed/atomic.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (!Other->isSimple() || Other->isIndexed())
      return false;
    // Don't mix temporal stores with non-temporal stores.
    if (St->isNonTemporal() != Other->isNonTemporal())
      return false;
    SDValue OtherBC = peekThroughBitcasts(Other->getValue());
    // Allow merging constants of different types as integers.
    bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
                                           : Other->getMemoryVT() != MemVT;
    switch (StoreSrc) {
    case StoreSource::Load: {
      if (NoTypeMatch)
        return false;
      // The Load's Base Ptr must also match.
      auto *OtherLd = dyn_cast<LoadSDNode>(OtherBC);
      if (!OtherLd)
        return false;
      BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
      if (LoadVT != OtherLd->getMemoryVT())
        return false;
      // Loads must only have one use.
      if (!OtherLd->hasNUsesOfValue(1, 0))
        return false;
      // The memory operands must not be volatile/indexed/atomic.
      // TODO: May be able to relax for unordered atomics (see D66309)
      if (!OtherLd->isSimple() || OtherLd->isIndexed())
        return false;
      // Don't mix temporal loads with non-temporal loads.
      if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
        return false;
      if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
        return false;
      break;
    }
    case StoreSource::Constant:
      if (NoTypeMatch)
        return false;
      if (!isIntOrFPConstant(OtherBC))
        return false;
      break;
    case StoreSource::Extract:
      // Do not merge truncated stores here.
      if (Other->isTruncatingStore())
        return false;
      if (!MemVT.bitsEq(OtherBC.getValueType()))
        return false;
      if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
          OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
        return false;
      break;
    default:
      llvm_unreachable("Unhandled store source for merging");
    }
    Ptr = BaseIndexOffset::match(Other, DAG);
    return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
  };

  // Check if the pair of StoreNode and the RootNode already bailed out many
  // times, which is over the limit in the dependence check.
  auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
                                        SDNode *RootNode) -> bool {
    auto RootCount = StoreRootCountMap.find(StoreNode);
    return RootCount != StoreRootCountMap.end() &&
           RootCount->second.first == RootNode &&
           RootCount->second.second > StoreMergeDependenceLimit;
  };

  auto TryToAddCandidate = [&](SDNode::use_iterator UseIter) {
    // This must be a chain use.
    if (UseIter.getOperandNo() != 0)
      return;
    if (auto *OtherStore = dyn_cast<StoreSDNode>(*UseIter)) {
      BaseIndexOffset Ptr;
      int64_t PtrDiff;
      if (CandidateMatch(OtherStore, Ptr, PtrDiff) &&
          !OverLimitInDependenceCheck(OtherStore, RootNode))
        StoreNodes.push_back(MemOpLink(OtherStore, PtrDiff));
    }
  };

  // We are looking for a root node which is an ancestor to all mergeable
  // stores. We search up through a load, to our root and then down
  // through all children. For instance we will find Store{1,2,3} if
  // St is Store1, Store2, or Store3 where the root is not a load,
  // which is always true for non-volatile ops. TODO: Expand
  // the search to find all valid candidates through multiple layers of loads.
  //
  // Root
  // |-------|-------|
  // Load    Load    Store3
  // |       |
  // Store1  Store2
  //
  // FIXME: We should be able to climb and
  // descend TokenFactors to find candidates as well.

  RootNode = St->getChain().getNode();

  unsigned NumNodesExplored = 0;
  const unsigned MaxSearchNodes = 1024;
  if (auto *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
    RootNode = Ldn->getChain().getNode();
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) {
      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) { // walk down chain
        for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
          TryToAddCandidate(I2);
      }
      // Check stores that depend on the root (e.g. Store 3 in the chart above).
      if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
        TryToAddCandidate(I);
      }
    }
  } else {
    for (auto I = RootNode->use_begin(), E = RootNode->use_end();
         I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored)
      TryToAddCandidate(I);
  }
}
// We need to check that merging these stores does not cause a loop in the
// DAG. Any store candidate may depend on another candidate indirectly through
// its operands. Check in parallel by searching up from operands of candidates.
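//
// For example (illustrative, not from the original source): candidate store
// S1 may store a value loaded from an address whose chain passes through
// candidate store S2; merging S1 and S2 into a single node would then make
// the merged store a predecessor of itself.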
bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
    SDNode *RootNode) {
  // FIXME: We should be able to truncate a full search of
  // predecessors by doing a BFS and keeping tabs on the originating
  // stores from which worklist nodes come, in a similar way to
  // TokenFactor simplification.

  SmallPtrSet<const SDNode *, 32> Visited;
  SmallVector<const SDNode *, 8> Worklist;

  // RootNode is a predecessor to all candidates so we need not search
  // past it. Add RootNode (peeking through TokenFactors). Do not count
  // these towards size check.

  Worklist.push_back(RootNode);
  while (!Worklist.empty()) {
    auto N = Worklist.pop_back_val();
    if (!Visited.insert(N).second)
      continue; // Already present in Visited.
    if (N->getOpcode() == ISD::TokenFactor) {
      for (SDValue Op : N->ops())
        Worklist.push_back(Op.getNode());
    }
  }

  // Don't count pruning nodes towards max.
  unsigned int Max = 1024 + Visited.size();
  // Search Ops of store candidates.
  for (unsigned i = 0; i < NumStores; ++i) {
    SDNode *N = StoreNodes[i].MemNode;
    // Of the 4 Store Operands:
    // * Chain (Op 0) -> We have already considered these
    //                   in candidate selection, but only by following the
    //                   chain dependencies. We could still have a chain
    //                   dependency to a load, that has a non-chain dep to
    //                   another load, that depends on a store, etc. So it is
    //                   possible to have dependencies that consist of a mix
    //                   of chain and non-chain deps, and we need to include
    //                   chain operands in the analysis here.
    // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
    // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
    //                     but aren't necessarily from the same base node, so
    //                     cycles possible (e.g. via indexed store).
    // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
    //             non-indexed stores). Not constant on all targets (e.g. ARM)
    //             and so can participate in a cycle.
    for (unsigned j = 0; j < N->getNumOperands(); ++j)
      Worklist.push_back(N->getOperand(j).getNode());
  }
  // Search through DAG. We can stop early if we find a store node.
  for (unsigned i = 0; i < NumStores; ++i)
    if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
                                     Max)) {
      // If the search bails out, record the StoreNode and RootNode in the
      // StoreRootCountMap. If we have seen the pair many times over a limit,
      // we won't add the StoreNode into StoreNodes set again.
      if (Visited.size() >= Max) {
        auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
        if (RootCount.first == RootNode)
          RootCount.second++;
        else
          RootCount = {RootNode, 1};
      }
      return false;
    }
  return true;
}
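
// Example (illustrative, not from the original source): with 4-byte elements
// and sorted candidate offsets {0, 4, 8, 20}, getConsecutiveStores returns 3
// for the run {0, 4, 8}; the store at offset 20 is left for a later
// iteration.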
unsigned
DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
                                  int64_t ElementSizeBytes) const {
  while (true) {
    // Find a store past the width of the first store.
    size_t StartIdx = 0;
    while ((StartIdx + 1 < StoreNodes.size()) &&
           StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
               StoreNodes[StartIdx + 1].OffsetFromBase)
      ++StartIdx;

    // Bail if we don't have enough candidates to merge.
    if (StartIdx + 1 >= StoreNodes.size())
      return 0;

    // Trim stores that overlapped with the first store.
    if (StartIdx)
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);

    // Scan the memory operations on the chain and find the first
    // non-consecutive store memory address.
    unsigned NumConsecutiveStores = 1;
    int64_t StartAddress = StoreNodes[0].OffsetFromBase;
    // Check that the addresses are consecutive starting from the second
    // element in the list of stores.
    for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
      int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
        break;
      NumConsecutiveStores = i + 1;
    }
    if (NumConsecutiveStores > 1)
      return NumConsecutiveStores;

    // There are no consecutive stores at the start of the list.
    // Remove the first store and try again.
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
  }
}
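
// Constant-merge example (illustrative, not from the original source): four
// consecutive i8 constant stores can become a single i32 store when i32
// stores are legal and fast for the target, or a v4i8 vector store when the
// target prefers vectors.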
bool DAGCombiner::tryStoreMergeOfConstants(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
    EVT MemVT, SDNode *RootNode, bool AllowVectors) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  int64_t ElementSizeBytes = MemVT.getStoreSize();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  // Store the constants into memory as one consecutive store.
  while (NumConsecutiveStores >= 2) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    Align FirstStoreAlign = FirstInChain->getAlign();
    unsigned LastLegalType = 1;
    unsigned LastLegalVectorType = 1;
    bool LastIntegerTrunc = false;
    bool NonZero = false;
    unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
      StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
      SDValue StoredVal = ST->getValue();
      bool IsElementZero = false;
      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
        IsElementZero = C->isZero();
      else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
        IsElementZero = C->getConstantFPValue()->isNullValue();
      if (IsElementZero) {
        if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
          FirstZeroAfterNonZero = i;
      }
      NonZero |= !IsElementZero;

      // Find a legal type for the constant store.
      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      bool IsFast = false;

      // Break early when size is too large to be legal.
      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      if (TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
                               DAG.getMachineFunction()) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast) {
        LastIntegerTrunc = false;
        LastLegalType = i + 1;
        // Or check whether a truncstore is legal.
      } else if (TLI.getTypeAction(Context, StoreTy) ==
                 TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValTy =
            TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
                                 DAG.getMachineFunction()) &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstInChain->getMemOperand(), &IsFast) &&
            IsFast) {
          LastIntegerTrunc = true;
          LastLegalType = i + 1;
        }
      }

      // We only use vectors if the constant is known to be zero or the
      // target allows it and the function is not marked with the
      // noimplicitfloat attribute.
      if ((!NonZero ||
           TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
          AllowVectors) {
        // Find a legal type for the vector store.
        unsigned Elts = (i + 1) * NumMemElts;
        EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
        if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
            TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
            TLI.allowsMemoryAccess(Context, DL, Ty,
                                   *FirstInChain->getMemOperand(), &IsFast) &&
            IsFast)
          LastLegalVectorType = i + 1;
      }
    }

    bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
    unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
    bool UseTrunc = LastIntegerTrunc && !UseVector;

    // Check if we found a legal integer type that creates a meaningful
    // merge.
    if (NumElem < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have, is if the alignment has
      // improved or we've dropped a non-zero value. Drop as many
      // candidates as we can here.
      unsigned NumSkip = 1;
      while ((NumSkip < NumConsecutiveStores) &&
             (NumSkip < FirstZeroAfterNonZero) &&
             (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
        NumSkip++;

      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
      NumConsecutiveStores -= NumElem;
      continue;
    }

    MadeChange |= mergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
                                                  /*IsConstantSrc*/ true,
                                                  UseVector, UseTrunc);

    // Remove merged stores for next iteration.
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
    NumConsecutiveStores -= NumElem;
  }

  return MadeChange;
}
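
// Extract-merge example (illustrative, not from the original source):
// storing lanes 0..3 extracted from a v4i32 value to consecutive addresses
// can be rebuilt as a single v4i32 store, via a BUILD_VECTOR or
// CONCAT_VECTORS of the extracted elements.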
bool DAGCombiner::tryStoreMergeOfExtracts(
    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
    EVT MemVT, SDNode *RootNode) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  // Loop on Consecutive Stores on success.
  while (NumConsecutiveStores >= 2) {
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    Align FirstStoreAlign = FirstInChain->getAlign();
    unsigned NumStoresToMerge = 1;
    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
      // Find a legal type for the vector store.
      unsigned Elts = (i + 1) * NumMemElts;
      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
      bool IsFast = false;

      // Break early when size is too large to be legal.
      if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      if (TLI.isTypeLegal(Ty) &&
          TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
          TLI.allowsMemoryAccess(Context, DL, Ty,
                                 *FirstInChain->getMemOperand(), &IsFast) &&
          IsFast)
        NumStoresToMerge = i + 1;
    }

    // Check if we found a legal vector type that creates a meaningful
    // merge.
    if (NumStoresToMerge < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have, is if the alignment has
      // improved. Drop as many candidates as we can here.
      unsigned NumSkip = 1;
      while ((NumSkip < NumConsecutiveStores) &&
             (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
        NumSkip++;

      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(),
                       StoreNodes.begin() + NumStoresToMerge);
      NumConsecutiveStores -= NumStoresToMerge;
      continue;
    }

    MadeChange |= mergeStoresOfConstantsOrVecElts(
        StoreNodes, MemVT, NumStoresToMerge, /*IsConstantSrc*/ false,
        /*UseVector*/ true, /*UseTrunc*/ false);

    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
    NumConsecutiveStores -= NumStoresToMerge;
  }

  return MadeChange;
}
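
// Load-merge example (illustrative, not from the original source): a
// memcpy-like sequence of four i8 load/store pairs over consecutive
// addresses can become one i32 load and one i32 store. For exactly two
// values loaded in reversed order, a wide load plus a ROTL/ROTR by half the
// width can still put both halves in place (the NeedRotate path below).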
bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
                                       unsigned NumConsecutiveStores, EVT MemVT,
                                       SDNode *RootNode, bool AllowVectors,
                                       bool IsNonTemporalStore,
                                       bool IsNonTemporalLoad) {
  LLVMContext &Context = *DAG.getContext();
  const DataLayout &DL = DAG.getDataLayout();
  int64_t ElementSizeBytes = MemVT.getStoreSize();
  unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
  bool MadeChange = false;

  // Look for load nodes which are used by the stored values.
  SmallVector<MemOpLink, 8> LoadNodes;

  // Find acceptable loads. Loads need to have the same chain (token factor),
  // must not be zext, volatile, indexed, and they must be consecutive.
  BaseIndexOffset LdBasePtr;

  for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
    SDValue Val = peekThroughBitcasts(St->getValue());
    LoadSDNode *Ld = cast<LoadSDNode>(Val);

    BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
    // If this is not the first ptr that we check.
    int64_t LdOffset = 0;
    if (LdBasePtr.getBase().getNode()) {
      // The base ptr must be the same.
      if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
        break;
    } else {
      // Check that all other base pointers are the same as this one.
      LdBasePtr = LdPtr;
    }

    // We found a potential memory operand to merge.
    LoadNodes.push_back(MemOpLink(Ld, LdOffset));
  }
  while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
    Align RequiredAlignment;
    bool NeedRotate = false;
    if (LoadNodes.size() == 2) {
      // If we have load/store pair instructions and we only have two values,
      // don't bother merging.
      if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
          StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
        StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
        LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
        break;
      }
      // If the loads are reversed, see if we can rotate the halves into place.
      int64_t Offset0 = LoadNodes[0].OffsetFromBase;
      int64_t Offset1 = LoadNodes[1].OffsetFromBase;
      EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
      if (Offset0 - Offset1 == ElementSizeBytes &&
          (hasOperation(ISD::ROTL, PairVT) ||
           hasOperation(ISD::ROTR, PairVT))) {
        std::swap(LoadNodes[0], LoadNodes[1]);
        NeedRotate = true;
      }
    }
    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
    Align FirstStoreAlign = FirstInChain->getAlign();
    LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);

    // Scan the memory operations on the chain and find the first
    // non-consecutive load memory address. These variables hold the index in
    // the store node array.

    unsigned LastConsecutiveLoad = 1;

    // These variables refer to the size and not index in the array.
    unsigned LastLegalVectorType = 1;
    unsigned LastLegalIntegerType = 1;
    bool isDereferenceable = true;
    bool DoIntegerTruncate = false;
    int64_t StartAddress = LoadNodes[0].OffsetFromBase;
    SDValue LoadChain = FirstLoad->getChain();
    for (unsigned i = 1; i < LoadNodes.size(); ++i) {
      // All loads must share the same chain.
      if (LoadNodes[i].MemNode->getChain() != LoadChain)
        break;

      int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
        break;
      LastConsecutiveLoad = i;

      if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
        isDereferenceable = false;

      // Find a legal type for the vector store.
      unsigned Elts = (i + 1) * NumMemElts;
      EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);

      // Break early when size is too large to be legal.
      if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
        break;

      bool IsFastSt = false;
      bool IsFastLd = false;
      // Don't try vector types if we need a rotate. We may still fail the
      // legality checks for the integer type, but we can't handle the rotate
      // case with vectors.
      // FIXME: We could use a shuffle in place of the rotate.
      if (!NeedRotate && TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
                               DAG.getMachineFunction()) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalVectorType = i + 1;
      }

      // Find a legal type for the integer store.
      unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
      StoreTy = EVT::getIntegerVT(Context, SizeInBits);
      if (TLI.isTypeLegal(StoreTy) &&
          TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
                               DAG.getMachineFunction()) &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstInChain->getMemOperand(), &IsFastSt) &&
          IsFastSt &&
          TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                 *FirstLoad->getMemOperand(), &IsFastLd) &&
          IsFastLd) {
        LastLegalIntegerType = i + 1;
        DoIntegerTruncate = false;
        // Or check whether a truncstore and extload is legal.
      } else if (TLI.getTypeAction(Context, StoreTy) ==
                 TargetLowering::TypePromoteInteger) {
        EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
        if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
            TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
                                 DAG.getMachineFunction()) &&
            TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstInChain->getMemOperand(), &IsFastSt) &&
            IsFastSt &&
            TLI.allowsMemoryAccess(Context, DL, StoreTy,
                                   *FirstLoad->getMemOperand(), &IsFastLd) &&
            IsFastLd) {
          LastLegalIntegerType = i + 1;
          DoIntegerTruncate = true;
        }
      }
    }

    // Only use vector types if the vector type is larger than the integer
    // type. If they are the same, use integers.
    bool UseVectorTy =
        LastLegalVectorType > LastLegalIntegerType && AllowVectors;
    unsigned LastLegalType =
        std::max(LastLegalVectorType, LastLegalIntegerType);

    // We add +1 here because the LastXXX variables refer to location while
    // the NumElem refers to array/index size.
    unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
    NumElem = std::min(LastLegalType, NumElem);
    Align FirstLoadAlign = FirstLoad->getAlign();
    if (NumElem < 2) {
      // We know that candidate stores are in order and of correct
      // shape. While there is no mergeable sequence from the
      // beginning one may start later in the sequence. The only
      // reason a merge of size N could have failed where another of
      // the same size would not have, is if the alignment of either
      // the load or store has improved. Drop as many candidates as we
      // can here.
      unsigned NumSkip = 1;
      while ((NumSkip < LoadNodes.size()) &&
             (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) &&
             (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
        NumSkip++;
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
      NumConsecutiveStores -= NumSkip;
      continue;
    }

    // Check that we can merge these candidates without causing a cycle.
    if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
                                                  RootNode)) {
      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
      LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
      NumConsecutiveStores -= NumElem;
      continue;
    }

    // Find if it is better to use vectors or integers to load and store
    // to memory.
    EVT JointMemOpVT;
    if (UseVectorTy) {
      // Find a legal type for the vector store.
      unsigned Elts = NumElem * NumMemElts;
      JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
    } else {
      unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
      JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
    }

    SDLoc LoadDL(LoadNodes[0].MemNode);
    SDLoc StoreDL(StoreNodes[0].MemNode);

    // The merged loads are required to have the same incoming chain, so
    // using the first's chain is acceptable.

    SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
    AddToWorklist(NewStoreChain.getNode());

    MachineMemOperand::Flags LdMMOFlags =
        isDereferenceable ? MachineMemOperand::MODereferenceable
                          : MachineMemOperand::MONone;
    if (IsNonTemporalLoad)
      LdMMOFlags |= MachineMemOperand::MONonTemporal;

    MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
                                              ? MachineMemOperand::MONonTemporal
                                              : MachineMemOperand::MONone;

    SDValue NewLoad, NewStore;
    if (UseVectorTy || !DoIntegerTruncate) {
      NewLoad = DAG.getLoad(
          JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
          FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
      SDValue StoreOp = NewLoad;
      if (NeedRotate) {
        unsigned LoadWidth = ElementSizeBytes * 8 * 2;
        assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
               "Unexpected type for rotate-able load pair");
        SDValue RotAmt =
            DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
        // Target can convert to the identical ROTR if it does not have ROTL.
        StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
      }
      NewStore = DAG.getStore(
          NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
          FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
    } else { // This must be the truncstore/extload case
      EVT ExtendedTy =
          TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
      NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
                               FirstLoad->getChain(), FirstLoad->getBasePtr(),
                               FirstLoad->getPointerInfo(), JointMemOpVT,
                               FirstLoadAlign, LdMMOFlags);
      NewStore = DAG.getTruncStore(
          NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
          FirstInChain->getPointerInfo(), JointMemOpVT,
          FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags());
    }

    // Transfer chain users from old loads to the new load.
    for (unsigned i = 0; i < NumElem; ++i) {
      LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
                                    SDValue(NewLoad.getNode(), 1));
    }

    // Replace all stores with the new store. Recursively remove corresponding
    // values if they are no longer used.
    for (unsigned i = 0; i < NumElem; ++i) {
      SDValue Val = StoreNodes[i].MemNode->getOperand(1);
      CombineTo(StoreNodes[i].MemNode, NewStore);
      if (Val->use_empty())
        recursivelyDeleteUnusedNodes(Val.getNode());
    }

    MadeChange = true;
    StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
    LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
    NumConsecutiveStores -= NumElem;
  }
  return MadeChange;
}
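
// Driver example (illustrative, not from the original source): given
// p[0]=1; p[1]=2; p[2]=3; p[3]=4 with i8 elements, mergeConsecutiveStores
// collects the candidates, finds a run of four consecutive offsets, and
// dispatches to tryStoreMergeOfConstants.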
bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
  if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
    return false;

  // TODO: Extend this function to merge stores of scalable vectors.
  // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
  // store since we know <vscale x 16 x i8> is exactly twice as large as
  // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
  EVT MemVT = St->getMemoryVT();
  if (MemVT.isScalableVector())
    return false;
  if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
    return false;

  // This function cannot currently deal with non-byte-sized memory sizes.
  int64_t ElementSizeBytes = MemVT.getStoreSize();
  if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
    return false;

  // Do not bother looking at stored values that are not constants, loads, or
  // extracted vector elements.
  SDValue StoredVal = peekThroughBitcasts(St->getValue());
  const StoreSource StoreSrc = getStoreSource(StoredVal);
  if (StoreSrc == StoreSource::Unknown)
    return false;

  SmallVector<MemOpLink, 8> StoreNodes;
  SDNode *RootNode;
  // Find potential store merge candidates by searching through the chain
  // sub-DAG.
  getStoreMergeCandidates(St, StoreNodes, RootNode);

  // Check if there is anything to merge.
  if (StoreNodes.size() < 2)
    return false;

  // Sort the memory operands according to their distance from the
  // base pointer.
  llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
    return LHS.OffsetFromBase < RHS.OffsetFromBase;
  });

  bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
      Attribute::NoImplicitFloat);
  bool IsNonTemporalStore = St->isNonTemporal();
  bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
                           cast<LoadSDNode>(StoredVal)->isNonTemporal();

  // Store Merge attempts to merge the lowest stores. This generally
  // works out well: when a merge succeeds, the remaining stores are
  // checked after the first collection of stores is merged. However, in
  // the case that a non-mergeable store is found first, e.g., {p[-2],
  // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
  // mergeable cases. To prevent this, we prune such stores from the
  // front of StoreNodes here.
  bool MadeChange = false;
  while (StoreNodes.size() > 1) {
    unsigned NumConsecutiveStores =
        getConsecutiveStores(StoreNodes, ElementSizeBytes);
    // There are no more stores in the list to examine.
    if (NumConsecutiveStores == 0)
      break;

    // We have at least 2 consecutive stores. Try to merge them.
    assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
    switch (StoreSrc) {
    case StoreSource::Constant:
      MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
                                             MemVT, RootNode, AllowVectors);
      break;

    case StoreSource::Extract:
      MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
                                            MemVT, RootNode);
      break;

    case StoreSource::Load:
      MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
                                         MemVT, RootNode, AllowVectors,
                                         IsNonTemporalStore, IsNonTemporalLoad);
      break;

    default:
      llvm_unreachable("Unhandled store source type");
    }
  }
  return MadeChange;
}
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
  SDLoc SL(ST);
  SDValue ReplStore;

  // Replace the chain to avoid dependency.
  if (ST->isTruncatingStore()) {
    ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
                                  ST->getBasePtr(), ST->getMemoryVT(),
                                  ST->getMemOperand());
  } else {
    ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
                             ST->getMemOperand());
  }

  // Create token to keep both nodes around.
  SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
                              MVT::Other, ST->getChain(), ReplStore);

  // Make sure the new and old chains are cleaned up.
  AddToWorklist(Token.getNode());

  // Don't add users to work list.
  return CombineTo(ST, Token, false);
}
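
// FP-constant example (illustrative, not from the original source):
//   (store f32 1.0, %p) --> (store i32 0x3F800000, %p)
// and on targets without legal i64 stores, an f64 constant becomes two i32
// stores of the constant's halves.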
SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
  SDValue Value = ST->getValue();
  if (Value.getOpcode() == ISD::TargetConstantFP)
    return SDValue();

  if (!ISD::isNormalStore(ST))
    return SDValue();

  SDLoc DL(ST);

  SDValue Chain = ST->getChain();
  SDValue Ptr = ST->getBasePtr();

  const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);

  // NOTE: If the original store is volatile, this transform must not increase
  // the number of stores. For example, on x86-32 an f64 can be stored in one
  // processor operation but an i64 (which is not legal) requires two. So the
  // transform should not be done in this case.

  SDValue Tmp;
  switch (CFP->getSimpleValueType(0).SimpleTy) {
  default:
    llvm_unreachable("Unknown FP type");
  case MVT::f16: // We don't do this for these yet.
  case MVT::f80:
  case MVT::f128:
  case MVT::ppcf128:
    return SDValue();
  case MVT::f32:
    if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
      Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
                            bitcastToAPInt().getZExtValue(), SDLoc(CFP),
                            MVT::i32);
      return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
    }

    return SDValue();
  case MVT::f64:
    if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
         ST->isSimple()) ||
        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
      Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
                            getZExtValue(), SDLoc(CFP), MVT::i64);
      return DAG.getStore(Chain, DL, Tmp,
                          Ptr, ST->getMemOperand());
    }

    if (ST->isSimple() &&
        TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
      // Many FP stores are not made apparent until after legalize, e.g. for
      // argument passing. Since this is so common, custom legalize the
      // 64-bit integer store into two 32-bit stores.
      uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
      SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
      SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
      if (DAG.getDataLayout().isBigEndian())
        std::swap(Lo, Hi);

      MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
      AAMDNodes AAInfo = ST->getAAInfo();

      SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
                                 ST->getOriginalAlign(), MMOFlags, AAInfo);
      Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL);
      SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
                                 ST->getPointerInfo().getWithOffset(4),
                                 ST->getOriginalAlign(), MMOFlags, AAInfo);
      return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
                         St0, St1);
    }

    return SDValue();
  }
}
SDValue DAGCombiner::visitSTORE(SDNode *N) {
  StoreSDNode *ST = cast<StoreSDNode>(N);
  SDValue Chain = ST->getChain();
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();

  // If this is a store of a bit convert, store the input value if the
  // resultant store does not need a higher alignment than the original.
  if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
      ST->isUnindexed()) {
    EVT SVT = Value.getOperand(0).getValueType();
    // If the store is volatile, we only want to change the store type if the
    // resulting store is legal. Otherwise we might increase the number of
    // memory accesses. We don't care if the original type was legal or not
    // as we assume software couldn't rely on the number of accesses of an
    // illegal type.
    // TODO: May be able to relax for unordered atomics (see D66309)
    if (((!LegalOperations && ST->isSimple()) ||
         TLI.isOperationLegal(ISD::STORE, SVT)) &&
        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
                                     DAG, *ST->getMemOperand())) {
      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                          ST->getMemOperand());
    }
  }

  // Turn 'store undef, Ptr' -> nothing.
  if (Value.isUndef() && ST->isUnindexed())
    return Chain;

  // Try to infer better alignment information than the store already has.
  if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
    if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
      if (*Alignment > ST->getAlign() &&
          isAligned(*Alignment, ST->getSrcValueOffset())) {
        SDValue NewStore =
            DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
                              ST->getMemoryVT(), *Alignment,
                              ST->getMemOperand()->getFlags(), ST->getAAInfo());
        // NewStore will always be N as we are only refining the alignment
        assert(NewStore.getNode() == N);
        (void)NewStore;
      }
    }
  }

  // Try transforming a pair of floating point load / store ops to integer
  // load / store ops.
  if (SDValue NewST = TransformFPLoadStorePair(N))
    return NewST;

  // Try transforming several stores into STORE (BSWAP).
  if (SDValue Store = mergeTruncStores(ST))
    return Store;

  if (ST->isUnindexed()) {
    // Walk up chain skipping non-aliasing memory nodes, on this store and any
    // adjacent stores.
    if (findBetterNeighborChains(ST)) {
      // replaceStoreChain uses CombineTo, which handled all of the worklist
      // manipulation. Return the original node to not do anything else.
      return SDValue(ST, 0);
    }
    Chain = ST->getChain();
  }
  // FIXME: is there such a thing as a truncating indexed store?
  if (ST->isTruncatingStore() && ST->isUnindexed() &&
      Value.getValueType().isInteger() &&
      (!isa<ConstantSDNode>(Value) ||
       !cast<ConstantSDNode>(Value)->isOpaque())) {
    // Convert a truncating store of an extension into a standard store.
    if ((Value.getOpcode() == ISD::ZERO_EXTEND ||
         Value.getOpcode() == ISD::SIGN_EXTEND ||
         Value.getOpcode() == ISD::ANY_EXTEND) &&
        Value.getOperand(0).getValueType() == ST->getMemoryVT() &&
        TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT()))
      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
                          ST->getMemOperand());

    APInt TruncDemandedBits =
        APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
                             ST->getMemoryVT().getScalarSizeInBits());

    // See if we can simplify the operation with SimplifyDemandedBits, which
    // only works if the value has a single use.
    AddToWorklist(Value.getNode());
    if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
      // Re-visit the store if anything changed and the store hasn't been
      // merged with another node (N is deleted). SimplifyDemandedBits will
      // add Value's node back to the worklist if necessary, but we also need
      // to re-visit the Store node itself.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }

    // Otherwise, see if we can simplify the input to this truncstore with
    // knowledge that only the low bits are being used. For example:
    // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
    if (SDValue Shorter =
            TLI.SimplifyMultipleUseDemandedBits(Value, TruncDemandedBits, DAG))
      return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
                               ST->getMemOperand());

    // If we're storing a truncated constant, see if we can simplify it.
    // TODO: Move this to targetShrinkDemandedConstant?
    if (auto *Cst = dyn_cast<ConstantSDNode>(Value))
      if (!Cst->isOpaque()) {
        const APInt &CValue = Cst->getAPIntValue();
        APInt NewVal = CValue & TruncDemandedBits;
        if (NewVal != CValue) {
          SDValue Shorter =
              DAG.getConstant(NewVal, SDLoc(N), Value.getValueType());
          return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr,
                                   ST->getMemoryVT(), ST->getMemOperand());
        }
      }
  }
  // If this is a load followed by a store to the same location, then the store
  // is dead/noop.
  // TODO: Can relax for unordered atomics (see D66309)
  if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
    if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
        ST->isUnindexed() && ST->isSimple() &&
        Ld->getAddressSpace() == ST->getAddressSpace() &&
        // There can't be any side effects between the load and store, such as
        // a call or store.
        Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
      // The store is dead, remove it.
      return Chain;
    }
  }

  // TODO: Can relax for unordered atomics (see D66309)
  if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
    if (ST->isUnindexed() && ST->isSimple() &&
        ST1->isUnindexed() && ST1->isSimple()) {
      if (OptLevel != CodeGenOpt::None && ST1->getBasePtr() == Ptr &&
          ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT() &&
          ST->getAddressSpace() == ST1->getAddressSpace()) {
        // If this is a store followed by a store with the same value to the
        // same location, then the store is dead/noop.
        return Chain;
      }

      if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
          !ST1->getBasePtr().isUndef() &&
          // BaseIndexOffset and the code below requires knowing the size
          // of a vector, so bail out if MemoryVT is scalable.
          !ST->getMemoryVT().isScalableVector() &&
          !ST1->getMemoryVT().isScalableVector() &&
          ST->getAddressSpace() == ST1->getAddressSpace()) {
        const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
        const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
        unsigned STBitSize = ST->getMemoryVT().getFixedSizeInBits();
        unsigned ChainBitSize = ST1->getMemoryVT().getFixedSizeInBits();
        // If the preceding store writes to a subset of the current store's
        // location and no other node is chained to that store, we can
        // effectively drop the preceding store. Do not remove stores to undef
        // as they may be used as data sinks.
        if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
          CombineTo(ST1, ST1->getChain());
          return SDValue();
        }
      }
    }
  }

  // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
  // truncating store. We can do this even if this is already a truncstore.
  if ((Value.getOpcode() == ISD::FP_ROUND ||
       Value.getOpcode() == ISD::TRUNCATE) &&
      Value->hasOneUse() && ST->isUnindexed() &&
      TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
                               ST->getMemoryVT(), LegalOperations)) {
    return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
                             Ptr, ST->getMemoryVT(), ST->getMemOperand());
  }

  // Always perform this optimization before types are legal. If the target
  // prefers, also try this after legalization to catch stores that were
  // created by intrinsics or other nodes.
  if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
    while (true) {
      // There can be multiple store sequences on the same chain.
      // Keep trying to merge store sequences until we are unable to do so
      // or until we merge the last store on the chain.
      bool Changed = mergeConsecutiveStores(ST);
      if (!Changed) break;
      // Return N as merge only uses CombineTo and no worklist clean
      // up is necessary.
      if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
        return SDValue(N, 0);
    }
  }

  // Try transforming N to an indexed store.
  if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
    return SDValue(N, 0);

  // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
  //
  // Make sure to do this only after attempting to merge stores in order to
  // avoid changing the types of some subset of stores due to visit order,
  // preventing their merging.
  if (isa<ConstantFPSDNode>(ST->getValue())) {
    if (SDValue NewSt = replaceStoreOfFPConstant(ST))
      return NewSt;
  }

  if (SDValue NewSt = splitMergedValStore(ST))
    return NewSt;

  return ReduceLoadOpStoreWidth(N);
}
SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
  const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
  if (!LifetimeEnd->hasOffset())
    return SDValue();

  const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
                                        LifetimeEnd->getOffset(), false);

  // We walk up the chains to find stores.
  SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
  while (!Chains.empty()) {
    SDValue Chain = Chains.pop_back_val();
    if (!Chain.hasOneUse())
      continue;
    switch (Chain.getOpcode()) {
    case ISD::TokenFactor:
      for (unsigned Nops = Chain.getNumOperands(); Nops;)
        Chains.push_back(Chain.getOperand(--Nops));
      break;
    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END:
      // We can forward past any lifetime start/end that can be proven not to
      // alias the node.
      if (!mayAlias(Chain.getNode(), N))
        Chains.push_back(Chain.getOperand(0));
      break;
    case ISD::STORE: {
      StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
      // TODO: Can relax for unordered atomics (see D66309)
      if (!ST->isSimple() || ST->isIndexed())
        break;
      const TypeSize StoreSize = ST->getMemoryVT().getStoreSize();
      // The bounds of a scalable store are not known until runtime, so this
      // store cannot be elided.
      if (StoreSize.isScalable())
        break;
      const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
      // If we store purely within object bounds just before its lifetime ends,
      // we can remove the store.
      if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
                                   StoreSize.getFixedSize() * 8)) {
        LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
                   dbgs() << "\nwithin LIFETIME_END of : ";
                   LifetimeEndBase.dump(); dbgs() << "\n");
        CombineTo(ST, ST->getChain());
        return SDValue(N, 0);
      }
    }
    }
  }
  return SDValue();
}
/// For the instruction sequence of store below, F and I values
/// are bundled together as an i64 value before being stored into memory.
/// Sometimes it is more efficient to generate separate stores for F and I,
/// which can remove the bitwise instructions or sink them to colder places.
///
/// (store (or (zext (bitcast F to i32) to i64),
///            (shl (zext I to i64), 32)), addr)  -->
/// (store F, addr) and (store I, addr+4)
///
/// Similarly, splitting other merged stores can also be beneficial, like:
/// For pair of {i32, i32}, i64 store --> two i32 stores.
/// For pair of {i32, i16}, i64 store --> two i32 stores.
/// For pair of {i16, i16}, i32 store --> two i16 stores.
/// For pair of {i16, i8},  i32 store --> two i16 stores.
/// For pair of {i8, i8},   i16 store --> two i8 stores.
///
/// We allow each target to determine specifically which kind of splitting is
/// supported.
///
/// The store patterns are commonly seen from the simple code snippet below
/// if only std::make_pair(...) is SROA-transformed before being inlined into
/// hoo.
/// void goo(const std::pair<int, float> &);
/// hoo() {
///   ...
///   goo(std::make_pair(tmp, ftmp));
///   ...
/// }
///
19276 SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
19277 if (OptLevel == CodeGenOpt::None)
19280 // Can't change the number of memory accesses for a volatile store or break
19281 // atomicity for an atomic one.
19282 if (!ST->isSimple())
19285 SDValue Val = ST->getValue();
19288 // Match OR operand.
  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
    return SDValue();
19292 // Match SHL operand and get Lower and Higher parts of Val.
19293 SDValue Op1 = Val.getOperand(0);
  SDValue Op2 = Val.getOperand(1);
  SDValue Lo, Hi;
  if (Op1.getOpcode() != ISD::SHL) {
    std::swap(Op1, Op2);
    if (Op1.getOpcode() != ISD::SHL)
      return SDValue();
  }
  Lo = Op2;
  Hi = Op1.getOperand(0);
  if (!Op1.hasOneUse())
    return SDValue();
19306 // Match shift amount to HalfValBitSize.
19307 unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
19308 ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
    return SDValue();
  // Lo and Hi must be zero-extended from scalar integers no wider than
  // HalfValBitSize.
19314 if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
19315 !Lo.getOperand(0).getValueType().isScalarInteger() ||
19316 Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
19317 Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
19318 !Hi.getOperand(0).getValueType().isScalarInteger() ||
      Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
    return SDValue();
  // Use the EVT of the low and high parts before bitcast as the inputs
  // to the target query.
19324 EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
19325 ? Lo.getOperand(0).getValueType()
19326 : Lo.getValueType();
19327 EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
19328 ? Hi.getOperand(0).getValueType()
19329 : Hi.getValueType();
  if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
    return SDValue();
19333 // Start to split store.
19334 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
19335 AAMDNodes AAInfo = ST->getAAInfo();
19337 // Change the sizes of Lo and Hi's value types to HalfValBitSize.
19338 EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
19339 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
19340 Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
19342 SDValue Chain = ST->getChain();
19343 SDValue Ptr = ST->getBasePtr();
19344 // Lower value store.
19345 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
19346 ST->getOriginalAlign(), MMOFlags, AAInfo);
19347 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL);
19348 // Higher value store.
  SDValue St1 = DAG.getStore(
      St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
      ST->getOriginalAlign(), MMOFlags, AAInfo);
  return St1;
}
19355 /// Convert a disguised subvector insertion into a shuffle:
19356 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
  assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
         "Expected insert_vector_elt");
19359 SDValue InsertVal = N->getOperand(1);
19360 SDValue Vec = N->getOperand(0);
  // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
  //   InsIndex)
  // --> (vector_shuffle X, Y) and variations where shuffle operands may be
  // CONCAT_VECTORS.
19366 if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
19367 InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19368 isa<ConstantSDNode>(InsertVal.getOperand(1))) {
19369 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
19370 ArrayRef<int> Mask = SVN->getMask();
19372 SDValue X = Vec.getOperand(0);
19373 SDValue Y = Vec.getOperand(1);
19375 // Vec's operand 0 is using indices from 0 to N-1 and
19376 // operand 1 from N to 2N - 1, where N is the number of
19377 // elements in the vectors.
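    // For illustration: with two v4i32 shuffle inputs, mask value 2 selects
    // X[2], while mask value 6 selects Y[2] (6 - 4 == 2).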
19378 SDValue InsertVal0 = InsertVal.getOperand(0);
19379 int ElementOffset = -1;
19381 // We explore the inputs of the shuffle in order to see if we find the
19382 // source of the extract_vector_elt. If so, we can use it to modify the
19383 // shuffle rather than perform an insert_vector_elt.
19384 SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
19385 ArgWorkList.emplace_back(Mask.size(), Y);
19386 ArgWorkList.emplace_back(0, X);
19388 while (!ArgWorkList.empty()) {
      int ArgOffset;
      SDValue ArgVal;
      std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
      if (ArgVal == InsertVal0) {
        ElementOffset = ArgOffset;
        break;
      }
19398 // Peek through concat_vector.
19399 if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
19400 int CurrentArgOffset =
19401 ArgOffset + ArgVal.getValueType().getVectorNumElements();
19402 int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
19403 for (SDValue Op : reverse(ArgVal->ops())) {
19404 CurrentArgOffset -= Step;
          ArgWorkList.emplace_back(CurrentArgOffset, Op);
        }

        // Make sure we went through all the elements and did not screw up
        // index computation.
        assert(CurrentArgOffset == ArgOffset);
      }
    }
    // If we failed to find a match, see if we can replace an UNDEF shuffle
    // operand.
19416 if (ElementOffset == -1 && Y.isUndef() &&
19417 InsertVal0.getValueType() == Y.getValueType()) {
      ElementOffset = Mask.size();
    }
19422 if (ElementOffset != -1) {
19423 SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
19425 auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
19426 NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
19427 assert(NewMask[InsIndex] <
19428 (int)(2 * Vec.getValueType().getVectorNumElements()) &&
19429 NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");
19431 SDValue LegalShuffle =
        TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X, Y,
                                    NewMask, DAG);
      if (LegalShuffle)
        return LegalShuffle;
    }
  }
19439 // insert_vector_elt V, (bitcast X from vector type), IdxC -->
19440 // bitcast(shuffle (bitcast V), (extended X), Mask)
19441 // Note: We do not use an insert_subvector node because that requires a
19442 // legal subvector type.
  if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
      !InsertVal.getOperand(0).getValueType().isVector())
    return SDValue();
19447 SDValue SubVec = InsertVal.getOperand(0);
19448 SDValue DestVec = N->getOperand(0);
19449 EVT SubVecVT = SubVec.getValueType();
19450 EVT VT = DestVec.getValueType();
19451 unsigned NumSrcElts = SubVecVT.getVectorNumElements();
  // If the source only has a single vector element, the cost of adding it
  // to a vector is likely to exceed the cost of an insert_vector_elt.
  if (NumSrcElts == 1)
    return SDValue();
19456 unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
19457 unsigned NumMaskVals = ExtendRatio * NumSrcElts;
19459 // Step 1: Create a shuffle mask that implements this insert operation. The
19460 // vector that we are inserting into will be operand 0 of the shuffle, so
19461 // those elements are just 'i'. The inserted subvector is in the first
19462 // positions of operand 1 of the shuffle. Example:
19463 // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
19464 SmallVector<int, 16> Mask(NumMaskVals);
19465 for (unsigned i = 0; i != NumMaskVals; ++i) {
19466 if (i / NumSrcElts == InsIndex)
      Mask[i] = (i % NumSrcElts) + NumMaskVals;
    else
      Mask[i] = i;
  }
19472 // Bail out if the target can not handle the shuffle we want to create.
19473 EVT SubVecEltVT = SubVecVT.getVectorElementType();
19474 EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
  if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
    return SDValue();
19478 // Step 2: Create a wide vector from the inserted source vector by appending
  // undefined elements. This is the same size as our destination vector.
  SDLoc DL(N);
  SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
19482 ConcatOps[0] = SubVec;
19483 SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
19485 // Step 3: Shuffle in the padded subvector.
19486 SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
19487 SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
19488 AddToWorklist(PaddedSubV.getNode());
19489 AddToWorklist(DestVecBC.getNode());
19490 AddToWorklist(Shuf.getNode());
  return DAG.getBitcast(VT, Shuf);
}
19494 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
19495 SDValue InVec = N->getOperand(0);
19496 SDValue InVal = N->getOperand(1);
19497 SDValue EltNo = N->getOperand(2);
  SDLoc DL(N);
  EVT VT = InVec.getValueType();
19501 auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
  // Inserting into an out-of-bounds element is undefined.
19504 if (IndexC && VT.isFixedLengthVector() &&
19505 IndexC->getZExtValue() >= VT.getVectorNumElements())
19506 return DAG.getUNDEF(VT);
19508 // Remove redundant insertions:
19509 // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
19510 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
    return InVec;

  if (!IndexC) {
19515 // If this is variable insert to undef vector, it might be better to splat:
19516 // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
19517 if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
19518 if (VT.isScalableVector())
19519 return DAG.getSplatVector(VT, DL, InVal);
19521 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
      return DAG.getBuildVector(VT, DL, Ops);
    }
    return SDValue();
  }

  if (VT.isScalableVector())
    return SDValue();
19530 unsigned NumElts = VT.getVectorNumElements();
19532 // We must know which element is being inserted for folds below here.
19533 unsigned Elt = IndexC->getZExtValue();
  if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
    return Shuf;
19538 // Handle <1 x ???> vector insertion special cases.
19539 if (NumElts == 1) {
19540 // insert_vector_elt(x, extract_vector_elt(y, 0), 0) -> y
19541 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19542 InVal.getOperand(0).getValueType() == VT &&
19543 isNullConstant(InVal.getOperand(1)))
      return InVal.getOperand(0);
  }
19547 // Canonicalize insert_vector_elt dag nodes.
19549 // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
19550 // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
19552 // Do this only if the child insert_vector node has one use; also
19553 // do this only if indices are both constants and Idx1 < Idx0.
19554 if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
19555 && isa<ConstantSDNode>(InVec.getOperand(2))) {
19556 unsigned OtherElt = InVec.getConstantOperandVal(2);
19557 if (Elt < OtherElt) {
19559 SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
19560 InVec.getOperand(0), InVal, EltNo);
19561 AddToWorklist(NewOp.getNode());
19562 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
                         VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
    }
  }
19567 // Attempt to convert an insert_vector_elt chain into a legal build_vector.
19568 if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
    // vXi1 vector - we don't need to recurse.
    if (NumElts == 1)
      return DAG.getBuildVector(VT, DL, {InVal});
19573 // If we haven't already collected the element, insert into the op list.
19574 EVT MaxEltVT = InVal.getValueType();
    auto AddBuildVectorOp = [&](SmallVectorImpl<SDValue> &Ops, SDValue Elt,
                                unsigned Idx) {
      if (!Ops[Idx]) {
        Ops[Idx] = Elt;
        if (VT.isInteger()) {
          EVT EltVT = Elt.getValueType();
          MaxEltVT = MaxEltVT.bitsGE(EltVT) ? MaxEltVT : EltVT;
        }
      }
    };
19586 // Ensure all the operands are the same value type, fill any missing
19587 // operands with UNDEF and create the BUILD_VECTOR.
19588 auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
19589 assert(Ops.size() == NumElts && "Unexpected vector size");
      for (SDValue &Op : Ops) {
        if (Op)
          Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
        else
          Op = DAG.getUNDEF(MaxEltVT);
      }
      return DAG.getBuildVector(VT, DL, Ops);
    };
    SmallVector<SDValue, 8> Ops(NumElts, SDValue());
    AddBuildVectorOp(Ops, InVal, Elt);
    // Recurse up an INSERT_VECTOR_ELT chain to build a BUILD_VECTOR.
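    // A sketch of the idea (operands hypothetical):
    //   (insert_vector_elt (insert_vector_elt undef, a, 0), b, 1)
    //   --> (build_vector a, b, undef, undef) for a v4i32 result.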
19603 for (SDValue CurVec = InVec; CurVec;) {
19604 // UNDEF - build new BUILD_VECTOR from already inserted operands.
19605 if (CurVec.isUndef())
19606 return CanonicalizeBuildVector(Ops);
19608 // BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
19609 if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) {
19610 for (unsigned I = 0; I != NumElts; ++I)
19611 AddBuildVectorOp(Ops, CurVec.getOperand(I), I);
        return CanonicalizeBuildVector(Ops);
      }
19615 // SCALAR_TO_VECTOR - insert unused scalar and build new BUILD_VECTOR.
19616 if (CurVec.getOpcode() == ISD::SCALAR_TO_VECTOR && CurVec.hasOneUse()) {
19617 AddBuildVectorOp(Ops, CurVec.getOperand(0), 0);
        return CanonicalizeBuildVector(Ops);
      }
19621 // INSERT_VECTOR_ELT - insert operand and continue up the chain.
19622 if (CurVec.getOpcode() == ISD::INSERT_VECTOR_ELT && CurVec.hasOneUse())
19623 if (auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2)))
19624 if (CurIdx->getAPIntValue().ult(NumElts)) {
19625 unsigned Idx = CurIdx->getZExtValue();
19626 AddBuildVectorOp(Ops, CurVec.getOperand(1), Idx);
19628 // Found entire BUILD_VECTOR.
19629 if (all_of(Ops, [](SDValue Op) { return !!Op; }))
19630 return CanonicalizeBuildVector(Ops);
            CurVec = CurVec->getOperand(0);
            continue;
          }
      // Failed to find a match in the chain - bail.
      break;
    }
19640 // See if we can fill in the missing constant elements as zeros.
19641 // TODO: Should we do this for any constant?
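    // Sketch (hypothetical v4i32): inserting %x at index 1 when elements
    // 0, 2 and 3 of InVec are known zero completes to
    // (build_vector 0, %x, 0, 0).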
19642 APInt DemandedZeroElts = APInt::getZero(NumElts);
19643 for (unsigned I = 0; I != NumElts; ++I)
      if (!Ops[I])
        DemandedZeroElts.setBit(I);
19647 if (DAG.MaskedVectorIsZero(InVec, DemandedZeroElts)) {
19648 SDValue Zero = VT.isInteger() ? DAG.getConstant(0, DL, MaxEltVT)
19649 : DAG.getConstantFP(0, DL, MaxEltVT);
      for (unsigned I = 0; I != NumElts; ++I)
        if (!Ops[I])
          Ops[I] = Zero;
      return CanonicalizeBuildVector(Ops);
    }
  }

  return SDValue();
}
19661 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
19663 LoadSDNode *OriginalLoad) {
19664 assert(OriginalLoad->isSimple());
19666 EVT ResultVT = EVE->getValueType(0);
19667 EVT VecEltVT = InVecVT.getVectorElementType();
  // If the vector element type is not a multiple of a byte then we are unable
  // to correctly compute an address to load only the extracted element as a
  // scalar.
  if (!VecEltVT.isByteSized())
    return SDValue();
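  // For example, an element of a v8i1 vector occupies a single bit, so there
  // is no byte address from which just that element could be loaded.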
  ISD::LoadExtType ExtTy =
      ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
  if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
      !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
    return SDValue();
19681 Align Alignment = OriginalLoad->getAlign();
19682 MachinePointerInfo MPI;
19684 if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
19685 int Elt = ConstEltNo->getZExtValue();
19686 unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
19687 MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
    Alignment = commonAlignment(Alignment, PtrOff);
  } else {
19690 // Discard the pointer info except the address space because the memory
19691 // operand can't represent this new access since the offset is variable.
19692 MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
    Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
  }
19696 bool IsFast = false;
19697 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
19698 OriginalLoad->getAddressSpace(), Alignment,
                              OriginalLoad->getMemOperand()->getFlags(),
                              &IsFast) ||
      !IsFast)
    return SDValue();
  SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
                                               InVecVT, EltNo);
19707 // We are replacing a vector load with a scalar load. The new load must have
  // identical memory op ordering to the original.
  SDLoc DL(EVE);
  SDValue Load;
  if (ResultVT.bitsGT(VecEltVT)) {
19711 // If the result type of vextract is wider than the load, then issue an
19712 // extending load instead.
    ISD::LoadExtType ExtType =
        TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD
                                                              : ISD::EXTLOAD;
19716 Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(),
19717 NewPtr, MPI, VecEltVT, Alignment,
19718 OriginalLoad->getMemOperand()->getFlags(),
19719 OriginalLoad->getAAInfo());
    DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
  } else {
19722 // The result type is narrower or the same width as the vector element
19723 Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI,
19724 Alignment, OriginalLoad->getMemOperand()->getFlags(),
19725 OriginalLoad->getAAInfo());
19726 DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
    if (ResultVT.bitsLT(VecEltVT))
      Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load);
    else
      Load = DAG.getBitcast(ResultVT, Load);
  }

  return Load;
}
19736 /// Transform a vector binary operation into a scalar binary operation by moving
19737 /// the math/logic after an extract element of a vector.
19738 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
19739 bool LegalOperations) {
19740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19741 SDValue Vec = ExtElt->getOperand(0);
19742 SDValue Index = ExtElt->getOperand(1);
19743 auto *IndexC = dyn_cast<ConstantSDNode>(Index);
  if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
      Vec->getNumValues() != 1)
    return SDValue();
19748 // Targets may want to avoid this to prevent an expensive register transfer.
  if (!TLI.shouldScalarizeBinop(Vec))
    return SDValue();
19752 // Extracting an element of a vector constant is constant-folded, so this
  // transform is just replacing a vector op with a scalar op while moving the
  // constant.
  SDValue Op0 = Vec.getOperand(0);
  SDValue Op1 = Vec.getOperand(1);
  APInt SplatVal;
  if (isAnyConstantBuildVector(Op0, true) ||
19759 ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
19760 isAnyConstantBuildVector(Op1, true) ||
19761 ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
19762 // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
19763 // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
    SDLoc DL(ExtElt);
    EVT VT = ExtElt->getValueType(0);
19766 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
19767 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
    return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
  }

  return SDValue();
}
19774 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
19775 SDValue VecOp = N->getOperand(0);
19776 SDValue Index = N->getOperand(1);
19777 EVT ScalarVT = N->getValueType(0);
19778 EVT VecVT = VecOp.getValueType();
19779 if (VecOp.isUndef())
19780 return DAG.getUNDEF(ScalarVT);
19782 // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
19784 // This only really matters if the index is non-constant since other combines
19785 // on the constant elements already work.
  SDLoc DL(N);
  if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
19788 Index == VecOp.getOperand(2)) {
19789 SDValue Elt = VecOp.getOperand(1);
    return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
  }

  // (vextract (scalar_to_vector val), 0) -> val
19794 if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
19795 // Only 0'th element of SCALAR_TO_VECTOR is defined.
19796 if (DAG.isKnownNeverZero(Index))
19797 return DAG.getUNDEF(ScalarVT);
19799 // Check if the result type doesn't match the inserted element type. A
19800 // SCALAR_TO_VECTOR may truncate the inserted element and the
19801 // EXTRACT_VECTOR_ELT may widen the extracted vector.
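    // For illustration (types hypothetical):
    //   (i16 (extract_vector_elt (v8i16 (scalar_to_vector (i32 %x))), 0))
    // yields the low 16 bits of %x, i.e. (i16 (trunc %x)).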
19802 SDValue InOp = VecOp.getOperand(0);
19803 if (InOp.getValueType() != ScalarVT) {
19804 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger() &&
19805 InOp.getValueType().bitsGT(ScalarVT));
      return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, InOp);
    }
    return InOp;
  }
19811 // extract_vector_elt of out-of-bounds element -> UNDEF
19812 auto *IndexC = dyn_cast<ConstantSDNode>(Index);
19813 if (IndexC && VecVT.isFixedLengthVector() &&
19814 IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
19815 return DAG.getUNDEF(ScalarVT);
19817 // extract_vector_elt (build_vector x, y), 1 -> y
19818 if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
19819 VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
19820 TLI.isTypeLegal(VecVT) &&
19821 (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
19822 assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
19823 VecVT.isFixedLengthVector()) &&
19824 "BUILD_VECTOR used for scalable vectors");
19825 unsigned IndexVal =
19826 VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
19827 SDValue Elt = VecOp.getOperand(IndexVal);
19828 EVT InEltVT = Elt.getValueType();
19830 // Sometimes build_vector's scalar input types do not match result type.
    if (ScalarVT == InEltVT)
      return Elt;

    // TODO: It may be useful to truncate if free if the build_vector
    // implicitly converts.
  }
  if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
    return BO;
  if (VecVT.isScalableVector())
    return SDValue();

  // All the code from this point onwards assumes fixed width vectors, but it's
  // possible that some of the combinations could be made to work for scalable
  // vectors too.
19847 unsigned NumElts = VecVT.getVectorNumElements();
19848 unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
19850 // TODO: These transforms should not require the 'hasOneUse' restriction, but
19851 // there are regressions on multiple targets without it. We can end up with a
19852 // mess of scalar and vector code if we reduce only part of the DAG to scalar.
19853 if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
19854 VecOp.hasOneUse()) {
    // The vector index of the LSBs of the source depends on the endianness.
19856 bool IsLE = DAG.getDataLayout().isLittleEndian();
19857 unsigned ExtractIndex = IndexC->getZExtValue();
19858 // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
19859 unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
19860 SDValue BCSrc = VecOp.getOperand(0);
19861 if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
19862 return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);
19864 if (LegalTypes && BCSrc.getValueType().isInteger() &&
19865 BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
19866 // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
19867 // trunc i64 X to i32
19868 SDValue X = BCSrc.getOperand(0);
19869 assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
19870 "Extract element and scalar to vector can't change element type "
19871 "from FP to integer.");
19872 unsigned XBitWidth = X.getValueSizeInBits();
19873 BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
19875 // An extract element return value type can be wider than its vector
19876 // operand element type. In that case, the high bits are undefined, so
19877 // it's possible that we may need to extend rather than truncate.
19878 if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
19879 assert(XBitWidth % VecEltBitWidth == 0 &&
19880 "Scalar bitwidth must be a multiple of vector element bitwidth");
        return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
      }
    }
  }
19886 // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
19887 // We only perform this optimization before the op legalization phase because
19888 // we may introduce new vector instructions which are not backed by TD
19889 // patterns. For example on AVX, extracting elements from a wide vector
19890 // without using extract_subvector. However, if we can find an underlying
19891 // scalar value, then we can always use that.
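  // A sketch of the fold (operands hypothetical):
  //   (extract_vector_elt (vector_shuffle<2,u,u,u> %t1, %t2), 0)
  //   --> (extract_vector_elt %t1, 2)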
19892 if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
19893 auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
19894 // Find the new index to extract from.
19895 int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
    // Extracting an undef index is undef.
    if (OrigElt == -1)
      return DAG.getUNDEF(ScalarVT);
    // Select the right vector half to extract from.
    SDValue SVInVec;
    if (OrigElt < (int)NumElts) {
      SVInVec = VecOp.getOperand(0);
    } else {
      SVInVec = VecOp.getOperand(1);
      OrigElt -= NumElts;
    }
19910 if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
19911 SDValue InOp = SVInVec.getOperand(OrigElt);
19912 if (InOp.getValueType() != ScalarVT) {
19913 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
        InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
      }
      return InOp;
    }
19920 // FIXME: We should handle recursing on other vector shuffles and
19921 // scalar_to_vector here as well.
19923 if (!LegalOperations ||
19924 // FIXME: Should really be just isOperationLegalOrCustom.
19925 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
19926 TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
19927 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
                         DAG.getVectorIdxConstant(OrigElt, DL));
    }
  }
19932 // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
19933 // simplify it based on the (valid) extraction indices.
19934 if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
19935 return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19936 Use->getOperand(0) == VecOp &&
               isa<ConstantSDNode>(Use->getOperand(1));
      })) {
    APInt DemandedElts = APInt::getZero(NumElts);
19940 for (SDNode *Use : VecOp->uses()) {
19941 auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
19942 if (CstElt->getAPIntValue().ult(NumElts))
        DemandedElts.setBit(CstElt->getZExtValue());
    }
19945 if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
19946 // We simplified the vector operand of this extract element. If this
19947 // extract is not dead, visit it again so it is folded properly.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }
19952 APInt DemandedBits = APInt::getAllOnes(VecEltBitWidth);
19953 if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
19954 // We simplified the vector operand of this extract element. If this
19955 // extract is not dead, visit it again so it is folded properly.
      if (N->getOpcode() != ISD::DELETED_NODE)
        AddToWorklist(N);
      return SDValue(N, 0);
    }
  }
19962 // Everything under here is trying to match an extract of a loaded value.
  // If the result of load has to be truncated, then it's not necessarily
  // profitable.
  bool BCNumEltsChanged = false;
  EVT ExtVT = VecVT.getVectorElementType();
  EVT LVT = ExtVT;
  if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
    return SDValue();
19971 if (VecOp.getOpcode() == ISD::BITCAST) {
19972 // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();
19976 EVT BCVT = VecOp.getOperand(0).getValueType();
    if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
      return SDValue();
19979 if (NumElts != BCVT.getVectorNumElements())
19980 BCNumEltsChanged = true;
19981 VecOp = VecOp.getOperand(0);
19982 ExtVT = BCVT.getVectorElementType();
19985 // extract (vector load $addr), i --> load $addr + i * size
19986 if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
19987 ISD::isNormalLoad(VecOp.getNode()) &&
19988 !Index->hasPredecessor(VecOp.getNode())) {
19989 auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
19990 if (VecLoad && VecLoad->isSimple())
      return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
  }
19994 // Perform only after legalization to ensure build_vector / vector_shuffle
19995 // optimizations have already been done.
  if (!LegalOperations || !IndexC)
    return SDValue();
19999 // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
20000 // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
20001 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
20002 int Elt = IndexC->getZExtValue();
20003 LoadSDNode *LN0 = nullptr;
20004 if (ISD::isNormalLoad(VecOp.getNode())) {
20005 LN0 = cast<LoadSDNode>(VecOp);
20006 } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
20007 VecOp.getOperand(0).getValueType() == ExtVT &&
20008 ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
20009 // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();
    LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
  }
20015 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
    // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
    // =>
    // (load $addr+1*size)
20020 // Don't duplicate a load with other uses.
    if (!VecOp.hasOneUse())
      return SDValue();
20024 // If the bit convert changed the number of elements, it is unsafe
20025 // to examine the mask.
    if (BCNumEltsChanged)
      return SDValue();
20029 // Select the input vector, guarding against out of range extract vector.
20030 int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
20031 VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
20033 if (VecOp.getOpcode() == ISD::BITCAST) {
20034 // Don't duplicate a load with other uses.
20035 if (!VecOp.hasOneUse())
      VecOp = VecOp.getOperand(0);
    }
20040 if (ISD::isNormalLoad(VecOp.getNode())) {
20041 LN0 = cast<LoadSDNode>(VecOp);
20042 Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
      Index = DAG.getConstant(Elt, DL, Index.getValueType());
    }
20045 } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
20046 VecVT.getVectorElementType() == ScalarVT &&
             (!LegalTypes ||
              TLI.isTypeLegal(
                  VecOp.getOperand(0).getValueType().getVectorElementType()))) {
20050 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
20051 // -> extract_vector_elt a, 0
20052 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
20053 // -> extract_vector_elt a, 1
20054 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
20055 // -> extract_vector_elt b, 0
20056 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
20057 // -> extract_vector_elt b, 1
    SDLoc SL(N);
    EVT ConcatVT = VecOp.getOperand(0).getValueType();
20060 unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
20061 SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
20062 Index.getValueType());
20064 SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
20065 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
                              ConcatVT.getVectorElementType(),
                              ConcatOp, NewIdx);
    return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
  }
  // Make sure we found a non-volatile load and the extractelement is
  // the only use.
  if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
    return SDValue();
  // If Idx was -1 above, Elt is going to be -1, so just return undef.
  if (Elt == -1)
    return DAG.getUNDEF(LVT);
  return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
}
20083 // Simplify (build_vec (ext )) to (bitcast (build_vec ))
20084 SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
20085 // We perform this optimization post type-legalization because
20086 // the type-legalizer often scalarizes integer-promoted vectors.
20087 // Performing this optimization before may create bit-casts which
20088 // will be type-legalized to complex code sequences.
20089 // We perform this optimization only before the operation legalizer because we
20090 // may introduce illegal operations.
20091 if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
20094 unsigned NumInScalars = N->getNumOperands();
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
20098 // Check to see if this is a BUILD_VECTOR of a bunch of values
20099 // which come from any_extend or zero_extend nodes. If so, we can create
20100 // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
  // optimizations. We do not handle sign-extend because we can't fill the sign
  // bits with zeros.
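  // A sketch on a hypothetical little-endian target:
  //   (v4i32 build_vector (zext i16 %a), (zext i16 %b),
  //                       (zext i16 %c), (zext i16 %d))
  //   --> (v4i32 bitcast (v8i16 build_vector %a, 0, %b, 0, %c, 0, %d, 0))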
20103 EVT SourceType = MVT::Other;
20104 bool AllAnyExt = true;
20106 for (unsigned i = 0; i != NumInScalars; ++i) {
20107 SDValue In = N->getOperand(i);
20108 // Ignore undef inputs.
20109 if (In.isUndef()) continue;
20111 bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
20112 bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
20114 // Abort if the element is not an extension.
20115 if (!ZeroExt && !AnyExt) {
      SourceType = MVT::Other;
      break;
    }
20120 // The input is a ZeroExt or AnyExt. Check the original type.
20121 EVT InTy = In.getOperand(0).getValueType();
20123 // Check that all of the widened source types are the same.
    if (SourceType == MVT::Other)
      // First time.
      SourceType = InTy;
    else if (InTy != SourceType) {
      // Multiple incoming types. Abort.
      SourceType = MVT::Other;
      break;
    }
20133 // Check if all of the extends are ANY_EXTENDs.
    AllAnyExt &= AnyExt;
  }
20137 // In order to have valid types, all of the inputs must be extended from the
20138 // same source type and all of the inputs must be any or zero extend.
20139 // Scalar sizes must be a power of two.
20140 EVT OutScalarTy = VT.getScalarType();
20141 bool ValidTypes = SourceType != MVT::Other &&
20142 isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
20143 isPowerOf2_32(SourceType.getSizeInBits());
20145 // Create a new simpler BUILD_VECTOR sequence which other optimizations can
  // turn into a single shuffle instruction.
  if (!ValidTypes)
    return SDValue();
20150 // If we already have a splat buildvector, then don't fold it if it means
20151 // introducing zeros.
  if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
    return SDValue();
20155 bool isLE = DAG.getDataLayout().isLittleEndian();
20156 unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
20157 assert(ElemRatio > 1 && "Invalid element size ratio");
20158 SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
20159 DAG.getConstant(0, DL, SourceType);
20161 unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
20162 SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
20164 // Populate the new build_vector
20165 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20166 SDValue Cast = N->getOperand(i);
20167 assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
20168 Cast.getOpcode() == ISD::ZERO_EXTEND ||
20169 Cast.isUndef()) && "Invalid cast opcode");
    SDValue In;
    if (Cast.isUndef())
      In = DAG.getUNDEF(SourceType);
    else
      In = Cast->getOperand(0);
20175 unsigned Index = isLE ? (i * ElemRatio) :
20176 (i * ElemRatio + (ElemRatio - 1));
    assert(Index < Ops.size() && "Invalid index");
    Ops[Index] = In;
  }
20182 // The type of the new BUILD_VECTOR node.
20183 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
20184 assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
20185 "Invalid vector size");
20186 // Check if the new vector type is legal.
20187 if (!isTypeLegal(VecVT) ||
20188 (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
       TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
    return SDValue();
20192 // Make the new BUILD_VECTOR.
20193 SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
20195 // The new BUILD_VECTOR node has the potential to be further optimized.
20196 AddToWorklist(BV.getNode());
20197 // Bitcast to the desired type.
  return DAG.getBitcast(VT, BV);
}
20201 // Simplify (build_vec (trunc $1)
20202 // (trunc (srl $1 half-width))
//             (trunc (srl $1 (2 * half-width))) …)
// to (bitcast $1)
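// E.g., on a little-endian target (values hypothetical):
//   (v2i32 build_vector (trunc i64 %x),
//                       (trunc (srl %x, 32)))
//   --> (v2i32 bitcast %x)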
20205 SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
20206 assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
20208 // Only for little endian
  if (!DAG.getDataLayout().isLittleEndian())
    return SDValue();
20213 EVT VT = N->getValueType(0);
20214 EVT OutScalarTy = VT.getScalarType();
20215 uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
20217 // Only for power of two types to be sure that bitcast works well
  if (!isPowerOf2_64(ScalarTypeBitsize))
    return SDValue();
20221 unsigned NumInScalars = N->getNumOperands();
20223 // Look through bitcasts
20224 auto PeekThroughBitcast = [](SDValue Op) {
20225 if (Op.getOpcode() == ISD::BITCAST)
      return Op.getOperand(0);
    return Op;
  };
  // The source value where all the parts are extracted.
  SDValue Src;
20232 for (unsigned i = 0; i != NumInScalars; ++i) {
20233 SDValue In = PeekThroughBitcast(N->getOperand(i));
20234 // Ignore undef inputs.
20235 if (In.isUndef()) continue;
    if (In.getOpcode() != ISD::TRUNCATE)
      return SDValue();
20240 In = PeekThroughBitcast(In.getOperand(0));
20242 if (In.getOpcode() != ISD::SRL) {
      // For now only build_vec without shuffling; handle shifts here in the
      // future.
      if (i != 0)
        return SDValue();

      Src = In;
    } else {
      SDValue part = PeekThroughBitcast(In.getOperand(0));

      if (!Src) {
        Src = part;
      } else if (Src != part) {
        // Vector parts do not stem from the same variable.
        return SDValue();
      }
20260 SDValue ShiftAmtVal = In.getOperand(1);
      if (!isa<ConstantSDNode>(ShiftAmtVal))
        return SDValue();
20264 uint64_t ShiftAmt = In.getConstantOperandVal(1);
20266 // The extracted value is not extracted at the right position
      if (ShiftAmt != i * ScalarTypeBitsize)
        return SDValue();
    }
  }
20272 // Only cast if the size is the same
  if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
    return SDValue();
  return DAG.getBitcast(VT, Src);
}
20279 SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
20280 ArrayRef<int> VectorMask,
20281 SDValue VecIn1, SDValue VecIn2,
20282 unsigned LeftIdx, bool DidSplitVec) {
20283 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
20285 EVT VT = N->getValueType(0);
20286 EVT InVT1 = VecIn1.getValueType();
20287 EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
20289 unsigned NumElems = VT.getVectorNumElements();
20290 unsigned ShuffleNumElems = NumElems;
20292 // If we artificially split a vector in two already, then the offsets in the
20293 // operands will all be based off of VecIn1, even those in VecIn2.
20294 unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
20296 uint64_t VTSize = VT.getFixedSizeInBits();
20297 uint64_t InVT1Size = InVT1.getFixedSizeInBits();
20298 uint64_t InVT2Size = InVT2.getFixedSizeInBits();
20300 assert(InVT2Size <= InVT1Size &&
20301 "Inputs must be sorted to be in non-increasing vector size order.");
20303 // We can't generate a shuffle node with mismatched input and output types.
20304 // Try to make the types match the type of the output.
20305 if (InVT1 != VT || InVT2 != VT) {
20306 if ((VTSize % InVT1Size == 0) && InVT1 == InVT2) {
20307 // If the output vector length is a multiple of both input lengths,
20308 // we can concatenate them and pad the rest with undefs.
20309 unsigned NumConcats = VTSize / InVT1Size;
20310 assert(NumConcats >= 2 && "Concat needs at least two inputs!");
20311 SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
20312 ConcatOps[0] = VecIn1;
20313 ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
20314 VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
20315 VecIn2 = SDValue();
20316 } else if (InVT1Size == VTSize * 2) {
      if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
        return SDValue();
20320 if (!VecIn2.getNode()) {
20321 // If we only have one input vector, and it's twice the size of the
20322 // output, split it in two.
20323 VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
20324 DAG.getVectorIdxConstant(NumElems, DL));
20325 VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
20326 // Since we now have shorter input vectors, adjust the offset of the
20327 // second vector's start.
        Vec2Offset = NumElems;
      } else {
20330 assert(InVT2Size <= InVT1Size &&
20331 "Second input is not going to be larger than the first one.");
20333 // VecIn1 is wider than the output, and we have another, possibly
20334 // smaller input. Pad the smaller input with undefs, shuffle at the
20335 // input vector width, and extract the output.
20336 // The shuffle type is different than VT, so check legality again.
20337 if (LegalOperations &&
20338 !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
20341 // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
20342 // lower it back into a BUILD_VECTOR. So if the inserted type is
20343 // illegal, don't even try.
20344 if (InVT1 != InVT2) {
20345 if (!TLI.isTypeLegal(InVT2))
20347 VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
                               DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
        }
        ShuffleNumElems = NumElems * 2;
      }
20352 } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) {
20353 SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
20354 ConcatOps[0] = VecIn2;
      VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
    } else {
      // TODO: Support cases where the length mismatch isn't exactly by a
      // power of 2.
      // TODO: Move this check upwards, so that if we have bad type
      // mismatches, we don't create any DAG nodes.
      return SDValue();
    }
  }
20365 // Initialize mask to undef.
20366 SmallVector<int, 8> Mask(ShuffleNumElems, -1);
20368 // Only need to run up to the number of elements actually used, not the
20369 // total number of elements in the shuffle - if we are shuffling a wider
20370 // vector, the high lanes should be set to undef.
20371 for (unsigned i = 0; i != NumElems; ++i) {
    if (VectorMask[i] <= 0)
      continue;
20375 unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
20376 if (VectorMask[i] == (int)LeftIdx) {
20377 Mask[i] = ExtIndex;
20378 } else if (VectorMask[i] == (int)LeftIdx + 1) {
      Mask[i] = Vec2Offset + ExtIndex;
    }
  }
20383 // The type the input vectors may have changed above.
20384 InVT1 = VecIn1.getValueType();
20386 // If we already have a VecIn2, it should have the same type as VecIn1.
20387 // If we don't, get an undef/zero vector of the appropriate type.
20388 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
20389 assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
20391 SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
20392 if (ShuffleNumElems > NumElems)
    Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);

  return Shuffle;
}
20398 static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
20399 assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
20401 // First, determine where the build vector is not undef.
20402 // TODO: We could extend this to handle zero elements as well as undefs.
  int NumBVOps = BV->getNumOperands();
  int ZextElt = -1;
20405 for (int i = 0; i != NumBVOps; ++i) {
    SDValue Op = BV->getOperand(i);
    if (Op.isUndef())
      continue;
    if (ZextElt == -1)
      ZextElt = i;
    else
      return SDValue();
  }
  // Bail out if there's no non-undef element.
  if (ZextElt == -1)
    return SDValue();
20418 // The build vector contains some number of undef elements and exactly
20419 // one other element. That other element must be a zero-extended scalar
20420 // extracted from a vector at a constant index to turn this into a shuffle.
  // Also, require that the build vector does not implicitly truncate/extend
  // its elements.
20423 // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
20424 EVT VT = BV->getValueType(0);
20425 SDValue Zext = BV->getOperand(ZextElt);
20426 if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
20427 Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
20428 !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
      Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();
20432 // The zero-extend must be a multiple of the source size, and we must be
20433 // building a vector of the same size as the source of the extract element.
20434 SDValue Extract = Zext.getOperand(0);
20435 unsigned DestSize = Zext.getValueSizeInBits();
20436 unsigned SrcSize = Extract.getValueSizeInBits();
20437 if (DestSize % SrcSize != 0 ||
      Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
    return SDValue();
  // Create a shuffle mask that will combine the extracted element with zeros
  // and undefs.
20443 int ZextRatio = DestSize / SrcSize;
20444 int NumMaskElts = NumBVOps * ZextRatio;
20445 SmallVector<int, 32> ShufMask(NumMaskElts, -1);
20446 for (int i = 0; i != NumMaskElts; ++i) {
20447 if (i / ZextRatio == ZextElt) {
20448 // The low bits of the (potentially translated) extracted element map to
20449 // the source vector. The high bits map to zero. We will use a zero vector
20450 // as the 2nd source operand of the shuffle, so use the 1st element of
20451 // that vector (mask value is number-of-elements) for the high bits.
20452 if (i % ZextRatio == 0)
        ShufMask[i] = Extract.getConstantOperandVal(1);
      else
        ShufMask[i] = NumMaskElts;
    }
  }
20458 // Undef elements of the build vector remain undef because we initialize
20459 // the shuffle mask with -1.
20462 // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
20463 // bitcast (shuffle V, ZeroVec, VectorMask)
  SDLoc DL(BV);
  EVT VecVT = Extract.getOperand(0).getValueType();
20466 SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
20467 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20468 SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
                                              ZeroVec, ShufMask, DAG);
  if (!Shuf)
    return SDValue();
  return DAG.getBitcast(VT, Shuf);
}
20475 // FIXME: promote to STLExtras.
20476 template <typename R, typename T>
20477 static auto getFirstIndexOf(R &&Range, const T &Val) {
20478 auto I = find(Range, Val);
20479 if (I == Range.end())
20480 return static_cast<decltype(std::distance(Range.begin(), I))>(-1);
20481 return std::distance(Range.begin(), I);
20484 // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
20485 // operations. If the types of the vectors we're extracting from allow it,
20486 // turn this into a vector_shuffle node.
20487 SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
20491 // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
  if (!isTypeLegal(VT))
    return SDValue();
  if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
    return V;
20498 // May only combine to shuffle after legalize if shuffle is legal.
  if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
    return SDValue();
20502 bool UsesZeroVector = false;
20503 unsigned NumElems = N->getNumOperands();
20505 // Record, for each element of the newly built vector, which input vector
20506 // that element comes from. -1 stands for undef, 0 for the zero vector,
20507 // and positive values for the input vectors.
20508 // VectorMask maps each element to its vector number, and VecIn maps vector
20509 // numbers to their initial SDValues.
20511 SmallVector<int, 8> VectorMask(NumElems, -1);
20512 SmallVector<SDValue, 8> VecIn;
20513 VecIn.push_back(SDValue());
20515 for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Op = N->getOperand(i);

    if (Op.isUndef())
      continue;
20521 // See if we can use a blend with a zero vector.
20522 // TODO: Should we generalize this to a blend with an arbitrary constant
20524 if (isNullConstant(Op) || isNullFPConstant(Op)) {
      UsesZeroVector = true;
      VectorMask[i] = 0;
      continue;
    }
20530 // Not an undef or zero. If the input is something other than an
20531 // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
20532 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op.getOperand(1)))
      return SDValue();
20535 SDValue ExtractedFromVec = Op.getOperand(0);
    if (ExtractedFromVec.getValueType().isScalableVector())
      return SDValue();
20540 const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
      return SDValue();
20544 // All inputs must have the same element type as the output.
20545 if (VT.getVectorElementType() !=
        ExtractedFromVec.getValueType().getVectorElementType())
      return SDValue();
20549 // Have we seen this input vector before?
20550 // The vectors are expected to be tiny (usually 1 or 2 elements), so using
20551 // a map back from SDValues to numbers isn't worth it.
20552 int Idx = getFirstIndexOf(VecIn, ExtractedFromVec);
20553 if (Idx == -1) { // A new source vector?
20554 Idx = VecIn.size();
      VecIn.push_back(ExtractedFromVec);
    }
    VectorMask[i] = Idx;
  }
20561 // If we didn't find at least one input vector, bail out.
  if (VecIn.size() < 2)
    return SDValue();
20565 // If all the Operands of BUILD_VECTOR extract from same
20566 // vector, then split the vector efficiently based on the maximum
20567 // vector access index and adjust the VectorMask and
20568 // VecIn accordingly.
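  // Sketch with hypothetical types: building a v2i32 from extracts at indices
  // 0 and 5 of one v16i32 source gives MaxIndex = 5 and NearestPow2 = 8, so
  // the source is split into two v4i32 halves (elements [0,4) and [4,8)), and
  // the mask entry for index 5 is remapped to the second half.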
20569 bool DidSplitVec = false;
20570 if (VecIn.size() == 2) {
20571 unsigned MaxIndex = 0;
20572 unsigned NearestPow2 = 0;
20573 SDValue Vec = VecIn.back();
20574 EVT InVT = Vec.getValueType();
20575 SmallVector<unsigned, 8> IndexVec(NumElems, 0);
20577 for (unsigned i = 0; i < NumElems; i++) {
      if (VectorMask[i] <= 0)
        continue;
20580 unsigned Index = N->getOperand(i).getConstantOperandVal(1);
20581 IndexVec[i] = Index;
      MaxIndex = std::max(MaxIndex, Index);
    }
20585 NearestPow2 = PowerOf2Ceil(MaxIndex);
20586 if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
20587 NumElems * 2 < NearestPow2) {
20588 unsigned SplitSize = NearestPow2 / 2;
20589 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
20590 InVT.getVectorElementType(), SplitSize);
20591 if (TLI.isTypeLegal(SplitVT) &&
20592 SplitSize + SplitVT.getVectorNumElements() <=
20593 InVT.getVectorNumElements()) {
20594 SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
20595 DAG.getVectorIdxConstant(SplitSize, DL));
20596 SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
20597 DAG.getVectorIdxConstant(0, DL));
        VecIn.pop_back();
        VecIn.push_back(VecIn1);
20600 VecIn.push_back(VecIn2);
20601 DidSplitVec = true;
20603 for (unsigned i = 0; i < NumElems; i++) {
          if (VectorMask[i] <= 0)
            continue;
          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
        }
      }
    }
  }
20612 // Sort input vectors by decreasing vector element count,
20613 // while preserving the relative order of equally-sized vectors.
  // Note that we keep the first "implicit" zero vector as-is.
20615 SmallVector<SDValue, 8> SortedVecIn(VecIn);
20616 llvm::stable_sort(MutableArrayRef<SDValue>(SortedVecIn).drop_front(),
20617 [](const SDValue &a, const SDValue &b) {
20618 return a.getValueType().getVectorNumElements() >
20619 b.getValueType().getVectorNumElements();
20622 // We now also need to rebuild the VectorMask, because it referenced element
20623 // order in VecIn, and we just sorted them.
20624 for (int &SourceVectorIndex : VectorMask) {
    if (SourceVectorIndex <= 0)
      continue;
20627 unsigned Idx = getFirstIndexOf(SortedVecIn, VecIn[SourceVectorIndex]);
20628 assert(Idx > 0 && Idx < SortedVecIn.size() &&
20629 VecIn[SourceVectorIndex] == SortedVecIn[Idx] && "Remapping failure");
20630 SourceVectorIndex = Idx;
20633 VecIn = std::move(SortedVecIn);
20635 // TODO: Should this fire if some of the input vectors has illegal type (like
20636 // it does now), or should we let legalization run its course first?
20639 // Take pairs of vectors, and shuffle them so that the result has elements
20640 // from these vectors in the correct places.
20641 // For example, given:
20642 // t10: i32 = extract_vector_elt t1, Constant:i64<0>
20643 // t11: i32 = extract_vector_elt t2, Constant:i64<0>
20644 // t12: i32 = extract_vector_elt t3, Constant:i64<0>
20645 // t13: i32 = extract_vector_elt t1, Constant:i64<1>
20646 // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
20647 // We will generate:
20648 // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
20649 // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
20650 SmallVector<SDValue, 4> Shuffles;
20651 for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
20652 unsigned LeftIdx = 2 * In + 1;
20653 SDValue VecLeft = VecIn[LeftIdx];
    SDValue VecRight =
        (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
20657 if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
20658 VecRight, LeftIdx, DidSplitVec))
      Shuffles.push_back(Shuffle);
    else
      return SDValue();
  }
20664 // If we need the zero vector as an "ingredient" in the blend tree, add it
20665 // to the list of shuffles.
20666 if (UsesZeroVector)
20667 Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
20668 : DAG.getConstantFP(0.0, DL, VT));
20670 // If we only have one shuffle, we're done.
20671 if (Shuffles.size() == 1)
20672 return Shuffles[0];
20674 // Update the vector mask to point to the post-shuffle vectors.
20675 for (int &Vec : VectorMask)
    if (Vec == 0)
      Vec = Shuffles.size() - 1;
    else
      Vec = (Vec - 1) / 2;
20681 // More than one shuffle. Generate a binary tree of blends, e.g. if from
  // the previous step we got the set of shuffles t10, t11, t12, t13, we will
  // generate:
20684 // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
20685 // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
20686 // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
20687 // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
20688 // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
20689 // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
20690 // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
20692 // Make sure the initial size of the shuffle list is even.
20693 if (Shuffles.size() % 2)
20694 Shuffles.push_back(DAG.getUNDEF(VT));
20696 for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
    if (CurSize % 2) {
      Shuffles[CurSize] = DAG.getUNDEF(VT);
      CurSize++;
    }
20701 for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
      int Left = 2 * In;
      int Right = 2 * In + 1;
20704 SmallVector<int, 8> Mask(NumElems, -1);
20705 SDValue L = Shuffles[Left];
20706 ArrayRef<int> LMask;
20707 bool IsLeftShuffle = L.getOpcode() == ISD::VECTOR_SHUFFLE &&
20708 L.use_empty() && L.getOperand(1).isUndef() &&
20709 L.getOperand(0).getValueType() == L.getValueType();
20710 if (IsLeftShuffle) {
20711 LMask = cast<ShuffleVectorSDNode>(L.getNode())->getMask();
20712 L = L.getOperand(0);
20714 SDValue R = Shuffles[Right];
20715 ArrayRef<int> RMask;
20716 bool IsRightShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE &&
20717 R.use_empty() && R.getOperand(1).isUndef() &&
20718 R.getOperand(0).getValueType() == R.getValueType();
20719 if (IsRightShuffle) {
20720 RMask = cast<ShuffleVectorSDNode>(R.getNode())->getMask();
20721 R = R.getOperand(0);
20723 for (unsigned I = 0; I != NumElems; ++I) {
20724 if (VectorMask[I] == Left) {
          Mask[I] = I;
          if (IsLeftShuffle)
            Mask[I] = LMask[I];
20728 VectorMask[I] = In;
20729 } else if (VectorMask[I] == Right) {
20730 Mask[I] = I + NumElems;
20731 if (IsRightShuffle)
20732 Mask[I] = RMask[I] + NumElems;
          VectorMask[I] = In;
        }
      }
      Shuffles[In] = DAG.getVectorShuffle(VT, DL, L, R, Mask);
    }
  }
  return Shuffles[0];
}
// Try to turn a build vector of zero extends of extract vector elts into a
// vector zero extend and possibly an extract subvector.
20745 // TODO: Support sign extend?
20746 // TODO: Allow undef elements?
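// A sketch of the intended fold (types hypothetical):
//   (v4i32 build_vector (zext (extract_vector_elt v8i16 %X, 4)),
//                       (zext (extract_vector_elt %X, 5)),
//                       (zext (extract_vector_elt %X, 6)),
//                       (zext (extract_vector_elt %X, 7)))
//   --> (v4i32 zero_extend (v4i16 extract_subvector %X, 4))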
20747 SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
  if (LegalOperations)
    return SDValue();
20751 EVT VT = N->getValueType(0);
20753 bool FoundZeroExtend = false;
20754 SDValue Op0 = N->getOperand(0);
20755 auto checkElem = [&](SDValue Op) -> int64_t {
20756 unsigned Opc = Op.getOpcode();
20757 FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
20758 if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
20759 Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
20760 Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
20761 if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
        return C->getZExtValue();
    return -1;
  };
20766 // Make sure the first element matches
20767 // (zext (extract_vector_elt X, C))
20768 // Offset must be a constant multiple of the
20769 // known-minimum vector length of the result type.
20770 int64_t Offset = checkElem(Op0);
  if (Offset < 0 || (Offset % VT.getVectorNumElements()) != 0)
    return SDValue();
20774 unsigned NumElems = N->getNumOperands();
20775 SDValue In = Op0.getOperand(0).getOperand(0);
20776 EVT InSVT = In.getValueType().getScalarType();
20777 EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
20779 // Don't create an illegal input type after type legalization.
  if (LegalTypes && !TLI.isTypeLegal(InVT))
    return SDValue();
20783 // Ensure all the elements come from the same vector and are adjacent.
20784 for (unsigned i = 1; i != NumElems; ++i) {
    if ((Offset + i) != checkElem(N->getOperand(i)))
      return SDValue();
  }

  SDLoc DL(N);
  In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
                   Op0.getOperand(0).getOperand(1));
  return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
                     VT, In);
}
20796 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
20797 EVT VT = N->getValueType(0);
20799 // A vector built entirely of undefs is undef.
20800 if (ISD::allOperandsUndef(N))
20801 return DAG.getUNDEF(VT);
20803 // If this is a splat of a bitcast from another vector, change to a
20806 // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
20807 // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
20809 // If X is a build_vector itself, the concat can become a larger build_vector.
20810 // TODO: Maybe this is useful for non-splat too?
20811 if (!LegalOperations) {
20812 if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
20813 Splat = peekThroughBitcasts(Splat);
20814 EVT SrcVT = Splat.getValueType();
20815 if (SrcVT.isVector()) {
20816 unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
20817 EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
20818 SrcVT.getVectorElementType(), NumElts);
20819 if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
20820 SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
          SDValue Concat =
              DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), NewVT, Ops);
          return DAG.getBitcast(VT, Concat);
        }
      }
    }
  }
20829 // Check if we can express BUILD VECTOR via subvector extract.
20830 if (!LegalTypes && (N->getNumOperands() > 1)) {
20831 SDValue Op0 = N->getOperand(0);
20832 auto checkElem = [&](SDValue Op) -> uint64_t {
20833 if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
20834 (Op0.getOperand(0) == Op.getOperand(0)))
20835 if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
          return CNode->getZExtValue();
      return -1;
    };
20840 int Offset = checkElem(Op0);
20841 for (unsigned i = 0; i < N->getNumOperands(); ++i) {
      if (Offset + i != checkElem(N->getOperand(i))) {
        Offset = -1;
        break;
      }
    }
20848 if ((Offset == 0) &&
20849 (Op0.getOperand(0).getValueType() == N->getValueType(0)))
20850 return Op0.getOperand(0);
20851 if ((Offset != -1) &&
20852 ((Offset % N->getValueType(0).getVectorNumElements()) ==
20853 0)) // IDX must be multiple of output size.
20854 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
                         Op0.getOperand(0), Op0.getOperand(1));
  }
  if (SDValue V = convertBuildVecZextToZext(N))
    return V;

  if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
    return V;

  if (SDValue V = reduceBuildVecTruncToBitCast(N))
    return V;

  if (SDValue V = reduceBuildVecToShuffle(N))
    return V;
20870 // A splat of a single element is a SPLAT_VECTOR if supported on the target.
20871 // Do this late as some of the above may replace the splat.
20872 if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
20873 if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
20874 assert(!V.isUndef() && "Splat of undef should have been handled earlier");
20875 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
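// Fold a CONCAT_VECTORS whose operands are all bitcasts from scalars (or
// undef) into a bitcast of a BUILD_VECTOR of those scalars. For example
// (illustrative):
//   concat_vectors (v2i32 (bitcast (i64 A))), (v2i32 (bitcast (i64 B)))
//     --> (v4i32 (bitcast (v2i64 build_vector A, B)))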
static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT OpVT = N->getOperand(0).getValueType();

  // If the operands are legal vectors, leave them alone.
  if (TLI.isTypeLegal(OpVT))
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SmallVector<SDValue, 8> Ops;

  EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
  SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);

  // Keep track of what we encounter.
  bool AnyInteger = false;
  bool AnyFP = false;
  for (const SDValue &Op : N->ops()) {
    if (ISD::BITCAST == Op.getOpcode() &&
        !Op.getOperand(0).getValueType().isVector())
      Ops.push_back(Op.getOperand(0));
    else if (ISD::UNDEF == Op.getOpcode())
      Ops.push_back(ScalarUndef);
    else
      return SDValue();

    // Note whether we encounter an integer or floating point scalar.
    // If it's neither, bail out; it could be something weird like x86mmx.
    EVT LastOpVT = Ops.back().getValueType();
    if (LastOpVT.isFloatingPoint())
      AnyFP = true;
    else if (LastOpVT.isInteger())
      AnyInteger = true;
    else
      return SDValue();
  }

  // If any of the operands is a floating point scalar bitcast to a vector,
  // use floating point types throughout, and bitcast everything.
  // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
  if (AnyFP) {
    SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
    ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
    if (AnyInteger) {
      for (SDValue &Op : Ops) {
        if (Op.getValueType() == SVT)
          continue;
        if (Op.isUndef())
          Op = ScalarUndef;
        else
          Op = DAG.getBitcast(SVT, Op);
      }
    }
  }

  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
                               VT.getSizeInBits() / SVT.getSizeInBits());
  return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
}

// Attempt to merge nested concat_vectors/undefs.
// Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d))
//  --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d)
static SDValue combineConcatVectorOfConcatVectors(SDNode *N,
                                                  SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);

  // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types.
  EVT SubVT;
  SDValue FirstConcat;
  for (const SDValue &Op : N->ops()) {
    if (Op.isUndef())
      continue;
    if (Op.getOpcode() != ISD::CONCAT_VECTORS)
      return SDValue();
    if (!FirstConcat) {
      SubVT = Op.getOperand(0).getValueType();
      if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
        return SDValue();
      FirstConcat = Op;
      continue;
    }
    if (SubVT != Op.getOperand(0).getValueType())
      return SDValue();
  }
  assert(FirstConcat && "Concat of all-undefs found");

  SmallVector<SDValue> ConcatOps;
  for (const SDValue &Op : N->ops()) {
    if (Op.isUndef()) {
      ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT));
      continue;
    }
    ConcatOps.append(Op->op_begin(), Op->op_end());
  }
  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps);
}

// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
// most two distinct vectors the same size as the result, attempt to turn this
// into a legal shuffle.
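//
// For example (illustrative; X and Y are v4i32):
//   v4i32 concat_vectors (v2i32 extract_subvector X, 2),
//                        (v2i32 extract_subvector Y, 0)
//     --> vector_shuffle X, Y, <2, 3, 4, 5>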
static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  EVT OpVT = N->getOperand(0).getValueType();

  // We currently can't generate an appropriate shuffle for a scalable vector.
  if (VT.isScalableVector())
    return SDValue();

  int NumElts = VT.getVectorNumElements();
  int NumOpElts = OpVT.getVectorNumElements();

  SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
  SmallVector<int, 8> Mask;

  for (SDValue Op : N->ops()) {
    Op = peekThroughBitcasts(Op);

    // UNDEF nodes convert to UNDEF shuffle mask values.
    if (Op.isUndef()) {
      Mask.append((unsigned)NumOpElts, -1);
      continue;
    }

    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();

    // What vector are we extracting the subvector from and at what index?
    SDValue ExtVec = Op.getOperand(0);
    int ExtIdx = Op.getConstantOperandVal(1);

    // We want the EVT of the original extraction to correctly scale the
    // extraction index.
    EVT ExtVT = ExtVec.getValueType();
    ExtVec = peekThroughBitcasts(ExtVec);

    // UNDEF nodes convert to UNDEF shuffle mask values.
    if (ExtVec.isUndef()) {
      Mask.append((unsigned)NumOpElts, -1);
      continue;
    }

    // Ensure that we are extracting a subvector from a vector the same
    // size as the result.
    if (ExtVT.getSizeInBits() != VT.getSizeInBits())
      return SDValue();

    // Scale the subvector index to account for any bitcast.
    int NumExtElts = ExtVT.getVectorNumElements();
    if (0 == (NumExtElts % NumElts))
      ExtIdx /= (NumExtElts / NumElts);
    else if (0 == (NumElts % NumExtElts))
      ExtIdx *= (NumElts / NumExtElts);
    else
      return SDValue();

    // At most we can reference 2 inputs in the final shuffle.
    if (SV0.isUndef() || SV0 == ExtVec) {
      SV0 = ExtVec;
      for (int i = 0; i != NumOpElts; ++i)
        Mask.push_back(i + ExtIdx);
    } else if (SV1.isUndef() || SV1 == ExtVec) {
      SV1 = ExtVec;
      for (int i = 0; i != NumOpElts; ++i)
        Mask.push_back(i + ExtIdx + NumElts);
    } else {
      return SDValue();
    }
  }

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
                                     DAG.getBitcast(VT, SV1), Mask, DAG);
}

static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
  unsigned CastOpcode = N->getOperand(0).getOpcode();
  switch (CastOpcode) {
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    // TODO: Allow more opcodes?
    // case ISD::BITCAST:
    // case ISD::TRUNCATE:
    // case ISD::ZERO_EXTEND:
    // case ISD::SIGN_EXTEND:
    // case ISD::FP_EXTEND:
    break;
  default:
    return SDValue();
  }

  EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
  if (!SrcVT.isVector())
    return SDValue();

  // All operands of the concat must be the same kind of cast from the same
  // source type.
  SmallVector<SDValue, 4> SrcOps;
  for (SDValue Op : N->ops()) {
    if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
        Op.getOperand(0).getValueType() != SrcVT)
      return SDValue();
    SrcOps.push_back(Op.getOperand(0));
  }

  // The wider cast must be supported by the target. This is unusual because
  // the operation support type parameter depends on the opcode. In addition,
  // check the other type in the cast to make sure this is really legal.
  EVT VT = N->getValueType(0);
  EVT SrcEltVT = SrcVT.getVectorElementType();
  ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands();
  EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (CastOpcode) {
  case ISD::SINT_TO_FP:
  case ISD::UINT_TO_FP:
    if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
        !TLI.isTypeLegal(VT))
      return SDValue();
    break;
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:
    if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
        !TLI.isTypeLegal(ConcatSrcVT))
      return SDValue();
    break;
  default:
    llvm_unreachable("Unexpected cast opcode");
  }

  // concat (cast X), (cast Y)... -> cast (concat X, Y...)
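  //
  // For example (illustrative):
  //   concat (v4f32 (sint_to_fp (v4i32 A))), (v4f32 (sint_to_fp (v4i32 B)))
  //     --> (v8f32 (sint_to_fp (v8i32 concat A, B)))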
  SDLoc DL(N);
  SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
  return DAG.getNode(CastOpcode, DL, VT, NewConcat);
}

SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
  // If we only have one input vector, we don't need to do any concatenation.
  if (N->getNumOperands() == 1)
    return N->getOperand(0);

  // Check if all of the operands are undefs.
  EVT VT = N->getValueType(0);
  if (ISD::allOperandsUndef(N))
    return DAG.getUNDEF(VT);

  // Optimize concat_vectors where all but the first of the vectors are undef.
  if (all_of(drop_begin(N->ops()),
             [](const SDValue &Op) { return Op.isUndef(); })) {
    SDValue In = N->getOperand(0);
    assert(In.getValueType().isVector() && "Must concat vectors");

    // If the input is a concat_vectors, just make a larger concat by padding
    // with smaller undefs.
    if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
      unsigned NumOps = N->getNumOperands() * In.getNumOperands();
      SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
      Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
    }

    SDValue Scalar = peekThroughOneUseBitcasts(In);

    // concat_vectors(scalar_to_vector(scalar), undef) ->
    //     scalar_to_vector(scalar)
    if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        Scalar.hasOneUse()) {
      EVT SVT = Scalar.getValueType().getVectorElementType();
      if (SVT == Scalar.getOperand(0).getValueType())
        Scalar = Scalar.getOperand(0);
    }

    // concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
    if (!Scalar.getValueType().isVector()) {
      // If the bitcast type isn't legal, it might be a trunc of a legal type;
      // look through the trunc so we can still do the transform:
      //   concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
      if (Scalar->getOpcode() == ISD::TRUNCATE &&
          !TLI.isTypeLegal(Scalar.getValueType()) &&
          TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
        Scalar = Scalar->getOperand(0);

      EVT SclTy = Scalar.getValueType();

      if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
        return SDValue();

      // Bail out if the vector size is not a multiple of the scalar size.
      if (VT.getSizeInBits() % SclTy.getSizeInBits())
        return SDValue();

      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
      if (VNTNumElms < 2)
        return SDValue();

      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
      if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
        return SDValue();

      SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
      return DAG.getBitcast(VT, Res);
    }
  }

  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
  // We have already tested above for an UNDEF only concatenation.
  // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
  // -> (BUILD_VECTOR A, B, ..., C, D, ...)
  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
  };
  if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
    SmallVector<SDValue, 8> Opnds;
    EVT SVT = VT.getScalarType();

    EVT MinVT = SVT;
    if (!SVT.isFloatingPoint()) {
      // If the BUILD_VECTORs are built from integers, they may have different
      // operand types. Get the smallest type and truncate all operands to it.
      bool FoundMinVT = false;
      for (const SDValue &Op : N->ops())
        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
          EVT OpSVT = Op.getOperand(0).getValueType();
          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
          FoundMinVT = true;
        }
      assert(FoundMinVT && "Concat vector type mismatch");
    }

    for (const SDValue &Op : N->ops()) {
      EVT OpVT = Op.getValueType();
      unsigned NumElts = OpVT.getVectorNumElements();

      if (ISD::UNDEF == Op.getOpcode())
        Opnds.append(NumElts, DAG.getUNDEF(MinVT));

      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
        if (SVT.isFloatingPoint()) {
          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
          Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
        } else {
          for (unsigned i = 0; i != NumElts; ++i)
            Opnds.push_back(
                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
        }
      }
    }

    assert(VT.getVectorNumElements() == Opnds.size() &&
           "Concat vector type mismatch");
    return DAG.getBuildVector(VT, SDLoc(N), Opnds);
  }

  // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
  // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
  if (SDValue V = combineConcatVectorOfScalars(N, DAG))
    return V;

  if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
    // Fold CONCAT_VECTORS of CONCAT_VECTORS (or undef) to VECTOR_SHUFFLE.
    if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG))
      return V;

    // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
    if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
      return V;
  }

  if (SDValue V = combineConcatVectorOfCasts(N, DAG))
    return V;

  // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
  // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
  // operands and look for CONCAT operations that place the incoming vectors
  // at the exact same location.
  //
  // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
  SDValue SingleSource = SDValue();
  unsigned PartNumElem =
      N->getOperand(0).getValueType().getVectorMinNumElements();

  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDValue Op = N->getOperand(i);

    if (Op.isUndef())
      continue;

    // Check if this is the identity extract:
    if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
      return SDValue();

    // Find the single incoming vector for the extract_subvector.
    if (SingleSource.getNode()) {
      if (Op.getOperand(0) != SingleSource)
        return SDValue();
    } else {
      SingleSource = Op.getOperand(0);

      // Check the source type is the same as the type of the result.
      // If not, this concat may extend the vector, so we cannot
      // optimize it away.
      if (SingleSource.getValueType() != N->getValueType(0))
        return SDValue();
    }

    // Check that we are reading from the identity index.
    unsigned IdentityIndex = i * PartNumElem;
    if (Op.getConstantOperandAPInt(1) != IdentityIndex)
      return SDValue();
  }

  if (SingleSource.getNode())
    return SingleSource;

  return SDValue();
}

// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
// if the subvector can be sourced for free.
static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
  if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
      V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
    return V.getOperand(1);
  }
  auto *IndexC = dyn_cast<ConstantSDNode>(Index);
  if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
      V.getOperand(0).getValueType() == SubVT &&
      (IndexC->getZExtValue() % SubVT.getVectorMinNumElements()) == 0) {
    uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorMinNumElements();
    return V.getOperand(SubIdx);
  }
  return SDValue();
}

static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
                                              SelectionDAG &DAG,
                                              bool LegalOperations) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue BinOp = Extract->getOperand(0);
  unsigned BinOpcode = BinOp.getOpcode();
  if (!TLI.isBinOp(BinOpcode) || BinOp->getNumValues() != 1)
    return SDValue();

  EVT VecVT = BinOp.getValueType();
  SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
  if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
    return SDValue();

  SDValue Index = Extract->getOperand(1);
  EVT SubVT = Extract->getValueType(0);
  if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT, LegalOperations))
    return SDValue();

  SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
  SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);

  // TODO: We could handle the case where only 1 operand is being inserted by
  // creating an extract of the other operand, but that requires checking
  // number of uses and/or costs.
  if (!Sub0 || !Sub1)
    return SDValue();

  // We are inserting both operands of the wide binop only to extract back
  // to the narrow vector size. Eliminate all of the insert/extract:
  // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
  return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
                     BinOp->getFlags());
}

/// If we are extracting a subvector produced by a wide binary operator try
/// to use a narrow binary operator and/or avoid concatenation and extraction.
static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG,
                                          bool LegalOperations) {
  // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
  // some of these bailouts with other transforms.

  if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG, LegalOperations))
    return V;

  // The extract index must be a constant, so we can map it to a concat operand.
  auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
  if (!ExtractIndexC)
    return SDValue();

  // We are looking for an optionally bitcasted wide vector binary operator
  // feeding an extract subvector.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
  unsigned BOpcode = BinOp.getOpcode();
  if (!TLI.isBinOp(BOpcode) || BinOp->getNumValues() != 1)
    return SDValue();

  // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
  // reduced to the unary fneg when it is visited, and we probably want to deal
  // with fneg in a target-specific way.
  if (BOpcode == ISD::FSUB) {
    auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
    if (C && C->getValueAPF().isNegZero())
      return SDValue();
  }

  // The binop must be a vector type, so we can extract some fraction of it.
  EVT WideBVT = BinOp.getValueType();
  // The optimisations below currently assume we are dealing with fixed length
  // vectors. It is possible to add support for scalable vectors, but at the
  // moment we've done no analysis to prove whether they are profitable or not.
  if (!WideBVT.isFixedLengthVector())
    return SDValue();

  EVT VT = Extract->getValueType(0);
  unsigned ExtractIndex = ExtractIndexC->getZExtValue();
  assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
         "Extract index is not a multiple of the vector length.");

  // Bail out if this is not a proper multiple width extraction.
  unsigned WideWidth = WideBVT.getSizeInBits();
  unsigned NarrowWidth = VT.getSizeInBits();
  if (WideWidth % NarrowWidth != 0)
    return SDValue();

  // Bail out if we are extracting a fraction of a single operation. This can
  // occur because we potentially looked through a bitcast of the binop.
  unsigned NarrowingRatio = WideWidth / NarrowWidth;
  unsigned WideNumElts = WideBVT.getVectorNumElements();
  if (WideNumElts % NarrowingRatio != 0)
    return SDValue();

  // Bail out if the target does not support a narrower version of the binop.
  EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
                                   WideNumElts / NarrowingRatio);
  if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
    return SDValue();

  // If extraction is cheap, we don't need to look at the binop operands
  // for concat ops. The narrow binop alone makes this transform profitable.
  // We can't just reuse the original extract index operand because we may have
  // bitcasted.
  unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
  unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
  if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
      BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
    // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
    SDLoc DL(Extract);
    SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
    SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
                            BinOp.getOperand(0), NewExtIndex);
    SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
                            BinOp.getOperand(1), NewExtIndex);
    SDValue NarrowBinOp =
        DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, BinOp->getFlags());
    return DAG.getBitcast(VT, NarrowBinOp);
  }

  // Only handle the case where we are doubling and then halving. A larger ratio
  // may require more than two narrow binops to replace the wide binop.
  if (NarrowingRatio != 2)
    return SDValue();

  // TODO: The motivating case for this transform is an x86 AVX1 target. That
  // target has temptingly almost legal versions of bitwise logic ops in 256-bit
  // flavors, but no other 256-bit integer support. This could be extended to
  // handle any binop, but that may require fixing/adding other folds to avoid
  // codegen regressions.
  if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
    return SDValue();

  // We need at least one concatenation operation of a binop operand to make
  // this transform worthwhile. The concat must double the input vector sizes.
  auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
    if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
      return V.getOperand(ConcatOpNum);
    return SDValue();
  };
  SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
  SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));

  if (SubVecL || SubVecR) {
    // If a binop operand was not the result of a concat, we must extract a
    // half-sized operand for our new narrow binop:
    // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
    // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
    // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
    SDLoc DL(Extract);
    SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
    SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
                        : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
                                      BinOp.getOperand(0), IndexC);

    SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
                        : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
                                      BinOp.getOperand(1), IndexC);

    SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
    return DAG.getBitcast(VT, NarrowBinOp);
  }

  return SDValue();
}

/// If we are extracting a subvector from a wide vector load, convert to a
/// narrow load to eliminate the extraction:
/// (extract_subvector (load wide vector)) --> (load narrow vector)
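/// For example (illustrative, little-endian):
/// (v2f64 (extract_subvector (v4f64 (load p)), 2)) --> (v2f64 (load p+16))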
static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
  // TODO: Add support for big-endian. The offset calculation must be adjusted.
  if (DAG.getDataLayout().isBigEndian())
    return SDValue();

  auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
  if (!Ld || Ld->getExtensionType() || !Ld->isSimple())
    return SDValue();

  // Allow targets to opt-out.
  EVT VT = Extract->getValueType(0);

  // We can only create byte sized loads.
  if (!VT.isByteSized())
    return SDValue();

  unsigned Index = Extract->getConstantOperandVal(1);
  unsigned NumElts = VT.getVectorMinNumElements();

  // The definition of EXTRACT_SUBVECTOR states that the index must be a
  // multiple of the minimum number of elements in the result type.
  assert(Index % NumElts == 0 && "The extract subvector index is not a "
                                 "multiple of the result's element count");

  // It's fine to use TypeSize here as we know the offset will not be negative.
  TypeSize Offset = VT.getStoreSize() * (Index / NumElts);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
    return SDValue();

  // The narrow load will be offset from the base address of the old load if
  // we are extracting from something besides index 0 (little-endian).
  SDLoc DL(Extract);

  // TODO: Use "BaseIndexOffset" to make this more effective.
  SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);

  uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
  MachineFunction &MF = DAG.getMachineFunction();
  MachineMemOperand *MMO;
  if (Offset.isScalable()) {
    MachinePointerInfo MPI =
        MachinePointerInfo(Ld->getPointerInfo().getAddrSpace());
    MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize);
  } else
    MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(),
                                  StoreSize);

  SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
  DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
  return NewLd;
}

/// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)),
/// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?),
///                               EXTRACT_SUBVECTOR(Op?, ?),
///                               Mask')
/// iff it is legal and profitable to do so. Notably, the trimmed mask
/// (containing only the elements that are extracted)
/// must reference at most two subvectors.
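///
/// For example (illustrative):
///   (v2i32 extract_subvector (v4i32 vector_shuffle X, Y, <1, 0, 6, 7>), 0)
///     --> vector_shuffle (v2i32 extract_subvector X, 0), undef, <1, 0>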
static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
                                                     SelectionDAG &DAG,
                                                     const TargetLowering &TLI,
                                                     bool LegalOperations) {
  assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
         "Must only be called on EXTRACT_SUBVECTOR's");

  SDValue N0 = N->getOperand(0);

  // Only deal with non-scalable vectors.
  EVT NarrowVT = N->getValueType(0);
  EVT WideVT = N0.getValueType();
  if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector())
    return SDValue();

  // The operand must be a shufflevector.
  auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0);
  if (!WideShuffleVector)
    return SDValue();

  // The old shuffle needs to go away.
  if (!WideShuffleVector->hasOneUse())
    return SDValue();

  // And the narrow shufflevector that we'll form must be legal.
  if (LegalOperations &&
      !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT))
    return SDValue();

  uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1);
  int NumEltsExtracted = NarrowVT.getVectorNumElements();
  assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 &&
         "Extract index is not a multiple of the output vector length.");

  int WideNumElts = WideVT.getVectorNumElements();

  SmallVector<int, 16> NewMask;
  NewMask.reserve(NumEltsExtracted);
  SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2>
      DemandedSubvectors;

  // Try to decode the wide mask into narrow mask from at most two subvectors.
  for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx,
                                                  NumEltsExtracted)) {
    assert((M >= -1) && (M < (2 * WideNumElts)) &&
           "Out-of-bounds shuffle mask?");

    if (M < 0) {
      // Does not depend on operands, does not require adjustment.
      NewMask.emplace_back(M);
      continue;
    }

    // From which operand of the shuffle does this shuffle mask element pick?
    int WideShufOpIdx = M / WideNumElts;
    // Which element of that operand is picked?
    int OpEltIdx = M % WideNumElts;

    assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M &&
           "Shuffle mask vector decomposition failure.");

    // And which NumEltsExtracted-sized subvector of that operand is that?
    int OpSubvecIdx = OpEltIdx / NumEltsExtracted;
    // And which element within that subvector of that operand is that?
    int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted;

    assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx &&
           "Shuffle mask subvector decomposition failure.");

    assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted +
            WideShufOpIdx * WideNumElts) == M &&
           "Shuffle mask full decomposition failure.");

    SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx);

    if (Op.isUndef()) {
      // Picking from an undef operand. Let's adjust mask instead.
      NewMask.emplace_back(-1);
      continue;
    }

    // Profitability check: only deal with extractions from the first subvector.
    if (OpSubvecIdx != 0)
      return SDValue();

    const std::pair<SDValue, int> DemandedSubvector =
        std::make_pair(Op, OpSubvecIdx);

    if (DemandedSubvectors.insert(DemandedSubvector)) {
      if (DemandedSubvectors.size() > 2)
        return SDValue(); // We can't handle more than two subvectors.
      // How many elements into the WideVT does this subvector start?
      int Index = NumEltsExtracted * OpSubvecIdx;
      // Bail out if the extraction isn't going to be cheap.
      if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index))
        return SDValue();
    }

    // Ok, but from which operand of the new shuffle will this element pick?
    int NewOpIdx =
        getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector);
    assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index.");

    int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted;
    NewMask.emplace_back(AdjM);
  }
  assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask.");
  assert(DemandedSubvectors.size() <= 2 &&
         "Should have ended up demanding at most two subvectors.");

  // Did we discover that the shuffle does not actually depend on operands?
  if (DemandedSubvectors.empty())
    return DAG.getUNDEF(NarrowVT);

  // We still perform the exact same EXTRACT_SUBVECTOR, just on different
  // operand[s]/index[es], so there is no point in checking its legality.

  // Do not turn a legal shuffle into an illegal one.
  if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) &&
      !TLI.isShuffleMaskLegal(NewMask, NarrowVT))
    return SDValue();

  SDLoc DL(N);

  SmallVector<SDValue, 2> NewOps;
  for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>
           &DemandedSubvector : DemandedSubvectors) {
    // How many elements into the WideVT does this subvector start?
    int Index = NumEltsExtracted * DemandedSubvector.second;
    SDValue IndexC = DAG.getVectorIdxConstant(Index, DL);
    NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT,
                                    DemandedSubvector.first, IndexC));
  }
  assert((NewOps.size() == 1 || NewOps.size() == 2) &&
         "Should end up with either one or two ops");

  // If we ended up with only one operand, pad with an undef.
  if (NewOps.size() == 1)
    NewOps.emplace_back(DAG.getUNDEF(NarrowVT));

  return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
}

SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
  EVT NVT = N->getValueType(0);
  SDValue V = N->getOperand(0);
  uint64_t ExtIdx = N->getConstantOperandVal(1);

  // Extract from UNDEF is UNDEF.
  if (V.isUndef())
    return DAG.getUNDEF(NVT);

  if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
    if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
      return NarrowLoad;

  // Combine an extract of an extract into a single extract_subvector.
  // ext (ext X, C), 0 --> ext X, C
  if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
    if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
                                    V.getConstantOperandVal(1)) &&
        TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
                         V.getOperand(1));
    }
  }

  // ty1 extract_vector(ty2 splat(V)) -> ty1 splat(V)
  if (V.getOpcode() == ISD::SPLAT_VECTOR)
    if (DAG.isConstantValueOfAnyType(V.getOperand(0)) || V.hasOneUse())
      if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT))
        return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0));

  // Try to move vector bitcast after extract_subv by scaling extraction index:
  // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
  if (V.getOpcode() == ISD::BITCAST &&
      V.getOperand(0).getValueType().isVector() &&
      (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))) {
    SDValue SrcOp = V.getOperand(0);
    EVT SrcVT = SrcOp.getValueType();
    unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
    unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
    if ((SrcNumElts % DestNumElts) == 0) {
      unsigned SrcDestRatio = SrcNumElts / DestNumElts;
      ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
      EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
                                      NewExtEC);
      if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
        SDLoc DL(N);
        SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
        SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
                                         V.getOperand(0), NewIndex);
        return DAG.getBitcast(NVT, NewExtract);
      }
    }
    if ((DestNumElts % SrcNumElts) == 0) {
      unsigned DestSrcRatio = DestNumElts / SrcNumElts;
      if (NVT.getVectorElementCount().isKnownMultipleOf(DestSrcRatio)) {
        ElementCount NewExtEC =
            NVT.getVectorElementCount().divideCoefficientBy(DestSrcRatio);
        EVT ScalarVT = SrcVT.getScalarType();
        if ((ExtIdx % DestSrcRatio) == 0) {
          SDLoc DL(N);
          unsigned IndexValScaled = ExtIdx / DestSrcRatio;
          EVT NewExtVT =
              EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
          if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
            SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
            SDValue NewExtract =
                DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
                            V.getOperand(0), NewIndex);
            return DAG.getBitcast(NVT, NewExtract);
          }
          if (NewExtEC.isScalar() &&
              TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
            SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
            SDValue NewExtract =
                DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
                            V.getOperand(0), NewIndex);
            return DAG.getBitcast(NVT, NewExtract);
          }
        }
      }
    }
  }

  if (V.getOpcode() == ISD::CONCAT_VECTORS) {
    unsigned ExtNumElts = NVT.getVectorMinNumElements();
    EVT ConcatSrcVT = V.getOperand(0).getValueType();
    assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
           "Concat and extract subvector do not change element type");
    assert((ExtIdx % ExtNumElts) == 0 &&
           "Extract index is not a multiple of the input vector length.");

    unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
    unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;

    // If the concatenated source types match this extract, it's a direct
    // simplification:
    // extract_subvec (concat V1, V2, ...), i --> Vi
    if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
      return V.getOperand(ConcatOpIdx);

    // If the concatenated source vectors are a multiple length of this extract,
    // then extract a fraction of one of those source vectors directly from a
    // concat operand. Example:
    //   v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y)), 14 -->
    //   v2i8 extract_subvec v8i8 Y, 6
    if (NVT.isFixedLengthVector() && ConcatSrcVT.isFixedLengthVector() &&
        ConcatSrcNumElts % ExtNumElts == 0) {
      SDLoc DL(N);
      unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
      assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
             "Trying to extract from >1 concat operand?");
      assert(NewExtIdx % ExtNumElts == 0 &&
             "Extract index is not a multiple of the input vector length.");
      SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
                         V.getOperand(ConcatOpIdx), NewIndexC);
    }
  }

  if (SDValue V =
          foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations))
    return V;

  V = peekThroughBitcasts(V);

  // If the input is a build vector, try to make a smaller build vector.
  if (V.getOpcode() == ISD::BUILD_VECTOR) {
    EVT InVT = V.getValueType();
    unsigned ExtractSize = NVT.getSizeInBits();
    unsigned EltSize = InVT.getScalarSizeInBits();
    // Only do this if we won't split any elements.
    if (ExtractSize % EltSize == 0) {
      unsigned NumElems = ExtractSize / EltSize;
      EVT EltVT = InVT.getVectorElementType();
      EVT ExtractVT =
          NumElems == 1 ? EltVT
                        : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
      if ((Level < AfterLegalizeDAG ||
           (NumElems == 1 ||
            TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
          (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
        unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;

        if (NumElems == 1) {
          SDValue Src = V->getOperand(IdxVal);
          if (EltVT != Src.getValueType())
            Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Src);
          return DAG.getBitcast(NVT, Src);
        }

        // Extract the pieces from the original build_vector.
        SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
                                              V->ops().slice(IdxVal, NumElems));
        return DAG.getBitcast(NVT, BuildVec);
      }
    }
  }

  if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
    // Handle only simple case where vector being inserted and vector
    // being extracted are of same size.
    EVT SmallVT = V.getOperand(1).getValueType();
    if (!NVT.bitsEq(SmallVT))
      return SDValue();

    // Combine:
    //   (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
    // Into:
    //   indices are equal or bit offsets are equal => V1
    //   otherwise => (extract_subvec V1, ExtIdx)
    uint64_t InsIdx = V.getConstantOperandVal(2);
    if (InsIdx * SmallVT.getScalarSizeInBits() ==
        ExtIdx * NVT.getScalarSizeInBits()) {
      if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
        return SDValue();

      return DAG.getBitcast(NVT, V.getOperand(1));
    }
    return DAG.getNode(
        ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
        DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
        N->getOperand(1));
  }

  if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
    return NarrowBOp;

  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);

  return SDValue();
}

/// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
/// followed by concatenation. Narrow vector ops may have better performance
/// than wide ops, and this can unlock further narrowing of other vector ops.
/// Targets can invert this transform later if it is not profitable.
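///
/// For example (illustrative, v4i32 result with v2i32 halves X and Y):
///   shuffle (concat X, undef), (concat Y, undef), <0, 4, 1, 5>
///     --> concat (shuffle X, Y, <0, 2>), (shuffle X, Y, <1, 3>)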
static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
                                         SelectionDAG &DAG) {
  SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
  if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
      N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
      !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
    return SDValue();

  // Split the wide shuffle mask into halves. Any mask element that is accessing
  // operand 1 is offset down to account for narrowing of the vectors.
  ArrayRef<int> Mask = Shuf->getMask();
  EVT VT = Shuf->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  unsigned HalfNumElts = NumElts / 2;
  SmallVector<int, 16> Mask0(HalfNumElts, -1);
  SmallVector<int, 16> Mask1(HalfNumElts, -1);
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] == -1)
      continue;
    // If we reference the upper (undef) subvector then the element is undef.
    if ((Mask[i] % NumElts) >= HalfNumElts)
      continue;
    int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
    if (i < HalfNumElts)
      Mask0[i] = M;
    else
      Mask1[i - HalfNumElts] = M;
  }

  // Ask the target if this is a valid transform.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
                                HalfNumElts);
  if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
      !TLI.isShuffleMaskLegal(Mask1, HalfVT))
    return SDValue();

  // shuffle (concat X, undef), (concat Y, undef), Mask -->
  // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
  SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
  SDLoc DL(Shuf);
  SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
  SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
}

// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
// or turn a shuffle of a single concat into a simpler shuffle followed by
// a concat.
static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
  ArrayRef<int> Mask = SVN->getMask();

  SmallVector<SDValue, 4> Ops;
  EVT ConcatVT = N0.getOperand(0).getValueType();
  unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
  unsigned NumConcats = NumElts / NumElemsPerConcat;

  auto IsUndefMaskElt = [](int i) { return i == -1; };

  // Special case: shuffle(concat(A,B)) can be more efficiently represented
  // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
  // half vector elements.
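  // For example (illustrative, v4i32 made of v2i32 halves A and B):
  //   shuffle (concat A, B), undef, <1, 0, u, u>
  //     --> concat (shuffle A, B, <1, 0>), undef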
  if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
      llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
                   IsUndefMaskElt)) {
    N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
                              N0.getOperand(1),
                              Mask.slice(0, NumElemsPerConcat));
    N1 = DAG.getUNDEF(ConcatVT);
    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
  }

  // Look at every vector that's inserted. We're looking for exact
  // subvector-sized copies from a concatenated vector.
  for (unsigned I = 0; I != NumConcats; ++I) {
    unsigned Begin = I * NumElemsPerConcat;
    ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);

    // Make sure we're dealing with a copy.
    if (llvm::all_of(SubMask, IsUndefMaskElt)) {
      Ops.push_back(DAG.getUNDEF(ConcatVT));
      continue;
    }

    int OpIdx = -1;
    for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
      if (IsUndefMaskElt(SubMask[i]))
        continue;
      if ((SubMask[i] % (int)NumElemsPerConcat) != i)
        return SDValue();
      int EltOpIdx = SubMask[i] / NumElemsPerConcat;
      if (0 <= OpIdx && EltOpIdx != OpIdx)
        return SDValue();
      OpIdx = EltOpIdx;
    }
    assert(0 <= OpIdx && "Unknown concat_vectors op");

    if (OpIdx < (int)N0.getNumOperands())
      Ops.push_back(N0.getOperand(OpIdx));
    else
      Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}

// Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
// BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
//
// SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
// a simplification in some sense, but it isn't appropriate in general: some
// BUILD_VECTORs are substantially cheaper than others. The general case
// of a BUILD_VECTOR requires inserting each element individually (or
// performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
// all constants is a single constant pool load. A BUILD_VECTOR where each
// element is identical is a splat. A BUILD_VECTOR where most of the operands
// are undef lowers to a small number of element insertions.
//
// To deal with this, we currently use a bunch of mostly arbitrary heuristics.
// We don't fold shuffles where one side is a non-zero constant, and we don't
// fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
// non-constant operands. This seems to work out reasonably well in practice.
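//
// For example (illustrative):
//   shuffle (build_vector A, B, C, D), undef, <3, 2, u, 0>
//     --> build_vector D, C, undef, A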
static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
                                       SelectionDAG &DAG,
                                       const TargetLowering &TLI) {
  EVT VT = SVN->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue N0 = SVN->getOperand(0);
  SDValue N1 = SVN->getOperand(1);

  if (!N0->hasOneUse())
    return SDValue();

  // If only one of N0,N1 is constant, bail out if it is not ALL_ZEROS as
  // discussed above.
  if (!N1.isUndef()) {
    if (!N1->hasOneUse())
      return SDValue();

    bool N0AnyConst = isAnyConstantBuildVector(N0);
    bool N1AnyConst = isAnyConstantBuildVector(N1);
    if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
      return SDValue();
    if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
      return SDValue();
  }

  // If both inputs are splats of the same value then we can safely merge this
  // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
  bool IsSplat = false;
  auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
  auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
  if (BV0 && BV1)
    if (SDValue Splat0 = BV0->getSplatValue())
      IsSplat = (Splat0 == BV1->getSplatValue());

  SmallVector<SDValue, 8> Ops;
  SmallSet<SDValue, 16> DuplicateOps;
  for (int M : SVN->getMask()) {
    SDValue Op = DAG.getUNDEF(VT.getScalarType());
    if (M >= 0) {
      int Idx = M < (int)NumElts ? M : M - NumElts;
      SDValue &S = (M < (int)NumElts ? N0 : N1);
      if (S.getOpcode() == ISD::BUILD_VECTOR) {
        Op = S.getOperand(Idx);
      } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
        SDValue Op0 = S.getOperand(0);
        Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
      } else {
        // Operand can't be combined - bail out.
        return SDValue();
      }
    }

    // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
    // generating a splat; semantically, this is fine, but it's likely to
    // generate low-quality code if the target can't reconstruct an appropriate
    // BUILD_VECTOR.
    if (!Op.isUndef() && !isIntOrFPConstant(Op))
      if (!IsSplat && !DuplicateOps.insert(Op).second)
        return SDValue();

    Ops.push_back(Op);
  }

  // BUILD_VECTOR requires all inputs to be of the same type; find the
  // maximum type and extend them all.
  EVT SVT = VT.getScalarType();
  if (SVT.isInteger())
    for (SDValue &Op : Ops)
      SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
  if (SVT != VT.getScalarType())
    for (SDValue &Op : Ops)
      Op = Op.isUndef() ? DAG.getUNDEF(SVT)
                        : (TLI.isZExtFree(Op.getValueType(), SVT)
                               ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
                               : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT));
  return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}

// Match shuffles that can be converted to any_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
// TODO: Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
                                            SelectionDAG &DAG,
                                            const TargetLowering &TLI,
                                            bool LegalOperations) {
  EVT VT = SVN->getValueType(0);
  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

  // TODO: Add support for big-endian when we have a test case.
  if (!VT.isInteger() || IsBigEndian)
    return SDValue();

  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  ArrayRef<int> Mask = SVN->getMask();
  SDValue N0 = SVN->getOperand(0);

  // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
        continue;
      return false;
    }
    return true;
  };

  // Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
  // power-of-2 extensions as they are the most likely.
  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
    // Check for non-power-of-2 vector sizes.
    if (NumElts % Scale != 0)
      continue;
    if (!isAnyExtend(Scale))
      continue;

    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
    // Never create an illegal type. Only create unsupported operations if we
    // are pre-legalization.
    if (TLI.isTypeLegal(OutVT))
      if (!LegalOperations ||
          TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
        return DAG.getBitcast(VT,
                              DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
                                          SDLoc(SVN), OutVT, N0));
  }

  return SDValue();
}

// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
// If the source node itself was a '*_extend_vector_inreg' node then we should
// then be able to remove it.
static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
                                        SelectionDAG &DAG) {
  EVT VT = SVN->getValueType(0);
  bool IsBigEndian = DAG.getDataLayout().isBigEndian();

  // TODO: Add support for big-endian when we have a test case.
  if (!VT.isInteger() || IsBigEndian)
    return SDValue();

  SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));

  unsigned Opcode = N0.getOpcode();
  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  ArrayRef<int> Mask = SVN->getMask();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
  unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();

  if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
    return SDValue();
  unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;

  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
  auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
    for (unsigned i = 0; i != NumElts; ++i) {
      if (Mask[i] < 0)
        continue;
      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
        continue;
      return false;
    }
    return true;
  };

  // At the moment we just handle the case where we've truncated back to the
  // same size as before the extension.
  // TODO: handle more extension/truncation cases as cases arise.
  if (EltSizeInBits != ExtSrcSizeInBits)
    return SDValue();

  // We can remove *extend_vector_inreg only if the truncation happens at
  // the same scale as the extension.
  if (isTruncate(ExtScale))
    return DAG.getBitcast(VT, N00);

  return SDValue();
}

// Combine shuffles of splat-shuffles of the form:
// shuffle (shuffle V, undef, splat-mask), undef, M
// If splat-mask contains undef elements, we need to be careful about
// introducing undef's in the folded mask which are not the result of composing
// the masks of the shuffles.
static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
                                        SelectionDAG &DAG) {
  if (!Shuf->getOperand(1).isUndef())
    return SDValue();

  // If the inner operand is a known splat with no undefs, just return that
  // directly.
  // TODO: Create DemandedElts mask from Shuf's mask.
  // TODO: Allow undef elements and merge with the shuffle code below.
  if (DAG.isSplatValue(Shuf->getOperand(0), /*AllowUndefs*/ false))
    return Shuf->getOperand(0);

  auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
  if (!Splat || !Splat->isSplat())
    return SDValue();

  ArrayRef<int> ShufMask = Shuf->getMask();
  ArrayRef<int> SplatMask = Splat->getMask();
  assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");

  // Prefer simplifying to the splat-shuffle, if possible. This is legal if
  // every undef mask element in the splat-shuffle has a corresponding undef
  // element in the user-shuffle's mask or if the composition of mask elements
  // would result in undef.
  // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
  // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
  // In this case it is not legal to simplify to the splat-shuffle because we
  // may be exposing to the users of the shuffle an undef element at index 1
  // which was not there before the combine.
  // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
  // In this case the composition of masks yields SplatMask, so it's ok to
  // simplify to the splat-shuffle.
  // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
  // In this case the composed mask includes all undef elements of SplatMask
  // and in addition sets element zero to undef. It is safe to simplify to
  // the splat-shuffle.
  auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
                                       ArrayRef<int> SplatMask) {
    for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
      if (UserMask[i] != -1 && SplatMask[i] == -1 &&
          SplatMask[UserMask[i]] != -1)
        return false;
    return true;
  };
  if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
    return Shuf->getOperand(0);

  // Create a new shuffle with a mask that is composed of the two shuffles'
  // masks.
  SmallVector<int, 32> NewMask;
  for (int Idx : ShufMask)
    NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);

  return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
                              Splat->getOperand(0), Splat->getOperand(1),
                              NewMask);
}

// Combine shuffles of bitcasts into a shuffle of the bitcast type, providing
// the mask can be treated as a larger type.
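//
// For example (illustrative):
//   shuffle (v4i32 bitcast (v2i64 X)), (v4i32 bitcast (v2i64 Y)), <0, 1, 6, 7>
//     --> (v4i32 bitcast (vector_shuffle X, Y, <0, 3>))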
static SDValue combineShuffleOfBitcast(ShuffleVectorSDNode *SVN,
                                       SelectionDAG &DAG,
                                       const TargetLowering &TLI,
                                       bool LegalOperations) {
  SDValue Op0 = SVN->getOperand(0);
  SDValue Op1 = SVN->getOperand(1);
  EVT VT = SVN->getValueType(0);
  if (Op0.getOpcode() != ISD::BITCAST)
    return SDValue();
  EVT InVT = Op0.getOperand(0).getValueType();
  if (!InVT.isVector() ||
      (!Op1.isUndef() && (Op1.getOpcode() != ISD::BITCAST ||
                          Op1.getOperand(0).getValueType() != InVT)))
    return SDValue();
  if (isAnyConstantBuildVector(Op0.getOperand(0)) &&
      (Op1.isUndef() || isAnyConstantBuildVector(Op1.getOperand(0))))
    return SDValue();

  int VTLanes = VT.getVectorNumElements();
  int InLanes = InVT.getVectorNumElements();
  if (VTLanes <= InLanes || VTLanes % InLanes != 0 ||
      (LegalOperations &&
       !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, InVT)))
    return SDValue();
  int Factor = VTLanes / InLanes;

  // Check that each group of lanes in the mask are either undef or make a valid
  // mask for the wider lane type.
  ArrayRef<int> Mask = SVN->getMask();
  SmallVector<int> NewMask;
  if (!widenShuffleMaskElts(Factor, Mask, NewMask))
    return SDValue();

  if (!TLI.isShuffleMaskLegal(NewMask, InVT))
    return SDValue();

  // Create the new shuffle with the new mask and bitcast it back to the
  // original type.
  SDLoc DL(SVN);
  Op0 = Op0.getOperand(0);
  Op1 = Op1.isUndef() ? DAG.getUNDEF(InVT) : Op1.getOperand(0);
  SDValue NewShuf = DAG.getVectorShuffle(InVT, DL, Op0, Op1, NewMask);
  return DAG.getBitcast(VT, NewShuf);
}

/// Combine shuffle of shuffle of the form:
/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
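///
/// For example (illustrative):
///   shuf (shuf X, undef, <3, u, 3, u>), undef, <0, 2, u, 2>
///     --> shuf X, undef, <3, 3, u, 3>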
static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
                                     SelectionDAG &DAG) {
  if (!OuterShuf->getOperand(1).isUndef())
    return SDValue();
  auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
  if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
    return SDValue();

  ArrayRef<int> OuterMask = OuterShuf->getMask();
  ArrayRef<int> InnerMask = InnerShuf->getMask();
  unsigned NumElts = OuterMask.size();
  assert(NumElts == InnerMask.size() && "Mask length mismatch");
  SmallVector<int, 32> CombinedMask(NumElts, -1);
  int SplatIndex = -1;
  for (unsigned i = 0; i != NumElts; ++i) {
    // Undef lanes remain undef.
    int OuterMaskElt = OuterMask[i];
    if (OuterMaskElt == -1)
      continue;

    // Peek through the shuffle masks to get the underlying source element.
    int InnerMaskElt = InnerMask[OuterMaskElt];
    if (InnerMaskElt == -1)
      continue;

    // Initialize the splatted element.
    if (SplatIndex == -1)
      SplatIndex = InnerMaskElt;

    // Non-matching index - this is not a splat.
    if (SplatIndex != InnerMaskElt)
      return SDValue();

    CombinedMask[i] = InnerMaskElt;
  }
  assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
          getSplatIndex(CombinedMask) != -1) &&
         "Expected a splat mask");

  // TODO: The transform may be a win even if the mask is not legal.
  EVT VT = OuterShuf->getValueType(0);
  assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
  if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
    return SDValue();

  return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
                              InnerShuf->getOperand(1), CombinedMask);
}

/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
/// from the first operand. Otherwise, return -1.
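///
/// For example (illustrative, mask size 4): <4, 5, 1, 7> returns 2, because
/// mask element 2 picks element 1 of operand 0 and every other element passes
/// through from operand 1; <4, 5, 6, 7> and <0, 1, 6, 7> both return -1.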
static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
  int MaskSize = Mask.size();
  int EltFromOp0 = -1;
  // TODO: This does not match if there are undef elements in the shuffle mask.
  // Should we ignore undefs in the shuffle mask instead? The trade-off is
  // removing an instruction (a shuffle), but losing the knowledge that some
  // vector lanes are not needed.
  for (int i = 0; i != MaskSize; ++i) {
    if (Mask[i] >= 0 && Mask[i] < MaskSize) {
      // We're looking for a shuffle of exactly one element from operand 0.
      if (EltFromOp0 != -1)
        return -1;
      EltFromOp0 = i;
    } else if (Mask[i] != i + MaskSize) {
      // Nothing from operand 1 can change lanes.
      return -1;
    }
  }
  return EltFromOp0;
}

22395 /// If a shuffle inserts exactly one element from a source vector operand into
22396 /// another vector operand and we can access the specified element as a scalar,
22397 /// then we can eliminate the shuffle.
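/// Sketch of the pattern this handles (operands and indices assumed):
/// shuffle (insertelt V1, X, 0), V2, <4,5,0,7> --> insertelt V2, X, 2
/// because lane 2 of the shuffle reads exactly the scalar X inserted at 0.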
22398 static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
22399 SelectionDAG &DAG) {
22400 // First, check if we are taking one element of a vector and shuffling that
22401 // element into another vector.
22402 ArrayRef<int> Mask = Shuf->getMask();
22403 SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
22404 SDValue Op0 = Shuf->getOperand(0);
22405 SDValue Op1 = Shuf->getOperand(1);
22406 int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
22407 if (ShufOp0Index == -1) {
22408 // Commute mask and check again.
22409 ShuffleVectorSDNode::commuteMask(CommutedMask);
22410 ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
22411 if (ShufOp0Index == -1)
22412 return SDValue();
22413 // Commute operands to match the commuted shuffle mask.
22414 std::swap(Op0, Op1);
22415 Mask = CommutedMask;
22416 }
22418 // The shuffle inserts exactly one element from operand 0 into operand 1.
22419 // Now see if we can access that element as a scalar via a real insert element
22420 // instruction.
22421 // TODO: We can try harder to locate the element as a scalar. Examples: it
22422 // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
22423 assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
22424 "Shuffle mask value must be from operand 0");
22425 if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
22426 return SDValue();
22428 auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
22429 if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
22430 return SDValue();
22432 // There's an existing insertelement with constant insertion index, so we
22433 // don't need to check the legality/profitability of a replacement operation
22434 // that differs at most in the constant value. The target should be able to
22435 // lower any of those in a similar way. If not, legalization will expand this
22436 // to a scalar-to-vector plus shuffle.
22438 // Note that the shuffle may move the scalar from the position that the insert
22439 // element used. Therefore, our new insert element occurs at the shuffle's
22440 // mask index value, not the insert's index value.
22441 // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
22442 SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
22443 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
22444 Op1, Op0.getOperand(1), NewInsIndex);
22445 }
22447 /// If we have a unary shuffle of a shuffle, see if it can be folded away
22448 /// completely. This has the potential to lose undef knowledge because the first
22449 /// shuffle may not have an undef mask element where the second one does. So
22450 /// only call this after doing simplifications based on demanded elements.
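/// Example of a removable outer shuffle (masks assumed): with
/// Mask0 = <1,1,2,3> and Mask = <1,0,2,3>, Mask0[Mask[i]] == Mask0[i] holds
/// for every lane, so the outer shuffle returns the inner shuffle unchanged.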
22451 static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
22452 // shuf (shuf0 X, Y, Mask0), undef, Mask
22453 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
22454 if (!Shuf0 || !Shuf->getOperand(1).isUndef())
22455 return SDValue();
22457 ArrayRef<int> Mask = Shuf->getMask();
22458 ArrayRef<int> Mask0 = Shuf0->getMask();
22459 for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
22460 // Ignore undef elements.
22461 if (Mask[i] == -1)
22462 continue;
22463 assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");
22465 // Is the element of the shuffle operand chosen by this shuffle the same as
22466 // the element chosen by the shuffle operand itself?
22467 if (Mask0[Mask[i]] != Mask0[i])
22468 return SDValue();
22469 }
22470 // Every element of this shuffle is identical to the result of the previous
22471 // shuffle, so we can replace this value.
22472 return Shuf->getOperand(0);
22473 }
22475 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
22476 EVT VT = N->getValueType(0);
22477 unsigned NumElts = VT.getVectorNumElements();
22479 SDValue N0 = N->getOperand(0);
22480 SDValue N1 = N->getOperand(1);
22482 assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
22484 // Canonicalize shuffle undef, undef -> undef
22485 if (N0.isUndef() && N1.isUndef())
22486 return DAG.getUNDEF(VT);
22488 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
22490 // Canonicalize shuffle v, v -> v, undef
22491 if (N0 == N1)
22492 return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
22493 createUnaryMask(SVN->getMask(), NumElts));
22495 // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
22496 if (N0.isUndef())
22497 return DAG.getCommutedVectorShuffle(*SVN);
22499 // Remove references to rhs if it is undef
22500 if (N1.isUndef()) {
22501 bool Changed = false;
22502 SmallVector<int, 8> NewMask;
22503 for (unsigned i = 0; i != NumElts; ++i) {
22504 int Idx = SVN->getMaskElt(i);
22505 if (Idx >= (int)NumElts) {
22506 Idx = -1;
22507 Changed = true;
22508 }
22509 NewMask.push_back(Idx);
22510 }
22511 if (Changed)
22512 return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
22513 }
22515 if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
22516 return InsElt;
22518 // A shuffle of a single vector that is a splatted value can always be folded.
22519 if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
22520 return V;
22522 if (SDValue V = formSplatFromShuffles(SVN, DAG))
22523 return V;
22525 // If it is a splat, check if the argument vector is another splat or a
22526 // build_vector.
22527 if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
22528 int SplatIndex = SVN->getSplatIndex();
22529 if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
22530 TLI.isBinOp(N0.getOpcode()) && N0->getNumValues() == 1) {
22531 // splat (vector_bo L, R), Index -->
22532 // splat (scalar_bo (extelt L, Index), (extelt R, Index))
22533 SDValue L = N0.getOperand(0), R = N0.getOperand(1);
22535 EVT EltVT = VT.getScalarType();
22536 SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
22537 SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
22538 SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
22539 SDValue NewBO =
22540 DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, N0->getFlags());
22541 SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
22542 SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
22543 return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
22544 }
22546 // splat(scalar_to_vector(x), 0) -> build_vector(x,...,x)
22547 // splat(insert_vector_elt(v, x, c), c) -> build_vector(x,...,x)
22548 if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) &&
22549 N0.hasOneUse()) {
22550 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && SplatIndex == 0)
22551 return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(0));
22553 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT)
22554 if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
22555 if (Idx->getAPIntValue() == SplatIndex)
22556 return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
22557 }
22559 // If this is a bit convert that changes the element type of the vector but
22560 // not the number of vector elements, look through it. Be careful not to
22561 // look through conversions that change things like v4f32 to v2f64.
22562 SDNode *V = N0.getNode();
22563 if (V->getOpcode() == ISD::BITCAST) {
22564 SDValue ConvInput = V->getOperand(0);
22565 if (ConvInput.getValueType().isVector() &&
22566 ConvInput.getValueType().getVectorNumElements() == NumElts)
22567 V = ConvInput.getNode();
22568 }
22570 if (V->getOpcode() == ISD::BUILD_VECTOR) {
22571 assert(V->getNumOperands() == NumElts &&
22572 "BUILD_VECTOR has wrong number of operands");
22574 bool AllSame = true;
22575 for (unsigned i = 0; i != NumElts; ++i) {
22576 if (!V->getOperand(i).isUndef()) {
22577 Base = V->getOperand(i);
22578 break;
22579 }
22580 }
22581 // Splat of <u, u, u, u>, return <u, u, u, u>
22582 if (!Base.getNode())
22583 return N0;
22584 for (unsigned i = 0; i != NumElts; ++i) {
22585 if (V->getOperand(i) != Base) {
22586 AllSame = false;
22587 break;
22588 }
22589 }
22590 // Splat of <x, x, x, x>, return <x, x, x, x>
22591 if (AllSame)
22592 return N0;
22594 // Canonicalize any other splat as a build_vector.
22595 SDValue Splatted = V->getOperand(SplatIndex);
22596 SmallVector<SDValue, 8> Ops(NumElts, Splatted);
22597 SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
22599 // We may have jumped through bitcasts, so the type of the
22600 // BUILD_VECTOR may not match the type of the shuffle.
22601 if (V->getValueType(0) != VT)
22602 NewBV = DAG.getBitcast(VT, NewBV);
22603 return NewBV;
22604 }
22605 }
22607 // Simplify source operands based on shuffle mask.
22608 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
22609 return SDValue(N, 0);
22611 // This is intentionally placed after demanded elements simplification because
22612 // it could eliminate knowledge of undef elements created by this shuffle.
22613 if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
22614 return ShufOp;
22616 // Match shuffles that can be converted to any_vector_extend_in_reg.
22617 if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
22618 return V;
22620 // Combine "truncate_vector_in_reg" style shuffles.
22621 if (SDValue V = combineTruncationShuffle(SVN, DAG))
22622 return V;
22624 if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
22625 Level < AfterLegalizeVectorOps &&
22626 (N1.isUndef() ||
22627 (N1.getOpcode() == ISD::CONCAT_VECTORS &&
22628 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
22629 if (SDValue V = partitionShuffleOfConcats(N, DAG))
22630 return V;
22631 }
22633 // A shuffle of a concat of the same narrow vector can be reduced to use
22634 // only low-half elements of a concat with undef:
22635 // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
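// e.g. (assumed v4 result where X is v2): Mask = <3,1,2,0> becomes
// Mask' = <1,1,0,0>, which only reads the low half of the new concat.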
22636 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
22637 N0.getNumOperands() == 2 &&
22638 N0.getOperand(0) == N0.getOperand(1)) {
22639 int HalfNumElts = (int)NumElts / 2;
22640 SmallVector<int, 8> NewMask;
22641 for (unsigned i = 0; i != NumElts; ++i) {
22642 int Idx = SVN->getMaskElt(i);
22643 if (Idx >= HalfNumElts) {
22644 assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
22645 Idx -= HalfNumElts;
22646 }
22647 NewMask.push_back(Idx);
22648 }
22649 if (TLI.isShuffleMaskLegal(NewMask, VT)) {
22650 SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
22651 SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
22652 N0.getOperand(0), UndefVec);
22653 return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
22654 }
22655 }
22657 // See if we can replace a shuffle with an insert_subvector.
22658 // e.g. v2i32 into v8i32:
22659 // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
22660 // --> insert_subvector(lhs,rhs1,4).
22661 if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
22662 TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
22663 auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
22664 // Ensure RHS subvectors are legal.
22665 assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
22666 EVT SubVT = RHS.getOperand(0).getValueType();
22667 int NumSubVecs = RHS.getNumOperands();
22668 int NumSubElts = SubVT.getVectorNumElements();
22669 assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
22670 if (!TLI.isTypeLegal(SubVT))
22671 return SDValue();
22673 // Don't bother if we have a unary shuffle (matches undef + LHS elts).
22674 if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
22675 return SDValue();
22677 // Search [NumSubElts] spans for RHS sequence.
22678 // TODO: Can we avoid nested loops to increase performance?
22679 SmallVector<int> InsertionMask(NumElts);
22680 for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
22681 for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
22682 // Reset mask to identity.
22683 std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
22685 // Add subvector insertion.
22686 std::iota(InsertionMask.begin() + SubIdx,
22687 InsertionMask.begin() + SubIdx + NumSubElts,
22688 NumElts + (SubVec * NumSubElts));
22690 // See if the shuffle mask matches the reference insertion mask.
22691 bool MatchingShuffle = true;
22692 for (int i = 0; i != (int)NumElts; ++i) {
22693 int ExpectIdx = InsertionMask[i];
22694 int ActualIdx = Mask[i];
22695 if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
22696 MatchingShuffle = false;
22697 break;
22698 }
22699 }
22701 if (MatchingShuffle)
22702 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
22703 RHS.getOperand(SubVec),
22704 DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
22705 }
22706 }
22707 return SDValue();
22708 };
22709 ArrayRef<int> Mask = SVN->getMask();
22710 if (N1.getOpcode() == ISD::CONCAT_VECTORS)
22711 if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
22712 return InsertN1;
22713 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
22714 SmallVector<int> CommuteMask(Mask.begin(), Mask.end());
22715 ShuffleVectorSDNode::commuteMask(CommuteMask);
22716 if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
22717 return InsertN0;
22718 }
22719 }
22721 // If we're not performing a select/blend shuffle, see if we can convert the
22722 // shuffle into an AND node, when all the out-of-lane elements are known zero.
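// Sketch (assumed v4i32 operands): for Mask = <0,1,4,5>, lanes 2 and 3 read
// N1[0] and N1[1]; if those lanes are known zero, the shuffle is equivalent
// to: and N0, <-1,-1,0,0>.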
22723 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
22724 bool IsInLaneMask = true;
22725 ArrayRef<int> Mask = SVN->getMask();
22726 SmallVector<int, 16> ClearMask(NumElts, -1);
22727 APInt DemandedLHS = APInt::getNullValue(NumElts);
22728 APInt DemandedRHS = APInt::getNullValue(NumElts);
22729 for (int I = 0; I != (int)NumElts; ++I) {
22730 int M = Mask[I];
22731 if (M < 0)
22732 continue;
22733 ClearMask[I] = M == I ? I : (I + NumElts);
22734 IsInLaneMask &= (M == I) || (M == (int)(I + NumElts));
22735 if (M != I) {
22736 APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
22737 Demanded.setBit(M % NumElts);
22738 }
22739 }
22740 // TODO: Should we try to mask with N1 as well?
22741 if (!IsInLaneMask &&
22742 (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
22743 (DemandedLHS.isNullValue() ||
22744 DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
22745 (DemandedRHS.isNullValue() ||
22746 DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
22747 SDLoc DL(N);
22748 EVT IntVT = VT.changeVectorElementTypeToInteger();
22749 EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
22750 SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
22751 SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
22752 SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
22753 for (int I = 0; I != (int)NumElts; ++I)
22754 if (Mask[I] >= 0)
22755 AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
22757 // See if a clear mask is legal instead of going via
22758 // XformToShuffleWithZero which loses UNDEF mask elements.
22759 if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
22760 return DAG.getBitcast(
22761 VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
22762 DAG.getConstant(0, DL, IntVT), ClearMask));
22764 if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
22765 return DAG.getBitcast(
22766 VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
22767 DAG.getBuildVector(IntVT, DL, AndMask)));
22768 }
22769 }
22771 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
22772 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
22773 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
22774 if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
22775 return Res;
22777 // If this shuffle only has a single input that is a bitcasted shuffle,
22778 // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
22779 // back to their original types.
22780 if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
22781 N1.isUndef() && Level < AfterLegalizeVectorOps &&
22782 TLI.isTypeLegal(VT)) {
22784 SDValue BC0 = peekThroughOneUseBitcasts(N0);
22785 if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
22786 EVT SVT = VT.getScalarType();
22787 EVT InnerVT = BC0->getValueType(0);
22788 EVT InnerSVT = InnerVT.getScalarType();
22790 // Determine which shuffle works with the smaller scalar type.
22791 EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
22792 EVT ScaleSVT = ScaleVT.getScalarType();
22794 if (TLI.isTypeLegal(ScaleVT) &&
22795 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
22796 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
22797 int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
22798 int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
22800 // Scale the shuffle masks to the smaller scalar type.
22801 ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
22802 SmallVector<int, 8> InnerMask;
22803 SmallVector<int, 8> OuterMask;
22804 narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
22805 narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);
22807 // Merge the shuffle masks.
22808 SmallVector<int, 8> NewMask;
22809 for (int M : OuterMask)
22810 NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
22812 // Test for shuffle mask legality over both commutations.
22813 SDValue SV0 = BC0->getOperand(0);
22814 SDValue SV1 = BC0->getOperand(1);
22815 bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
22816 if (!LegalMask) {
22817 std::swap(SV0, SV1);
22818 ShuffleVectorSDNode::commuteMask(NewMask);
22819 LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
22820 }
22822 if (LegalMask) {
22823 SV0 = DAG.getBitcast(ScaleVT, SV0);
22824 SV1 = DAG.getBitcast(ScaleVT, SV1);
22825 return DAG.getBitcast(
22826 VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
22827 }
22828 }
22829 }
22830 }
22832 // Match shuffles of bitcasts, so long as the mask can be treated as the
22833 // equivalent shuffle of the wider element type.
22834 if (SDValue V = combineShuffleOfBitcast(SVN, DAG, TLI, LegalOperations))
22835 return V;
22837 // Compute the combined shuffle mask for a shuffle with SV0 as the first
22838 // operand, and SV1 as the second operand.
22839 // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false
22840 // Merge SVN(N1, OtherSVN) -> shuffle(SV0, SV1, Mask') iff Commute = true
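// e.g. (operands assumed): merging
// shuffle (shuffle A, B, <0,5,2,7>), undef, <1,1,3,3>
// looks through the inner mask (1 -> 5, 3 -> 7), so both lanes land in B and
// the single merged shuffle is: shuffle B, undef, <1,1,3,3>.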
22841 auto MergeInnerShuffle =
22842 [NumElts, &VT](bool Commute, ShuffleVectorSDNode *SVN,
22843 ShuffleVectorSDNode *OtherSVN, SDValue N1,
22844 const TargetLowering &TLI, SDValue &SV0, SDValue &SV1,
22845 SmallVectorImpl<int> &Mask) -> bool {
22846 // Don't try to fold splats; they're likely to simplify somehow, or they
22847 // might be free.
22848 if (OtherSVN->isSplat())
22849 return false;
22851 SV0 = SV1 = SDValue();
22852 Mask.clear();
22854 for (unsigned i = 0; i != NumElts; ++i) {
22855 int Idx = SVN->getMaskElt(i);
22856 if (Idx < 0) {
22857 // Propagate Undef.
22858 Mask.push_back(Idx);
22859 continue;
22860 }
22862 if (Commute)
22863 Idx = (Idx < (int)NumElts) ? (Idx + NumElts) : (Idx - NumElts);
22865 SDValue CurrentVec;
22866 if (Idx < (int)NumElts) {
22867 // This shuffle index refers to the inner shuffle N0. Lookup the inner
22868 // shuffle mask to identify which vector is actually referenced.
22869 Idx = OtherSVN->getMaskElt(Idx);
22870 if (Idx < 0) {
22871 // Propagate Undef.
22872 Mask.push_back(Idx);
22873 continue;
22874 }
22875 CurrentVec = (Idx < (int)NumElts) ? OtherSVN->getOperand(0)
22876 : OtherSVN->getOperand(1);
22877 } else {
22878 // This shuffle index references an element within N1.
22879 CurrentVec = N1;
22880 }
22882 // Simple case where 'CurrentVec' is UNDEF.
22883 if (CurrentVec.isUndef()) {
22884 Mask.push_back(-1);
22885 continue;
22886 }
22888 // Canonicalize the shuffle index. We don't know yet if CurrentVec
22889 // will be the first or second operand of the combined shuffle.
22890 Idx = Idx % NumElts;
22891 if (!SV0.getNode() || SV0 == CurrentVec) {
22892 // Ok. CurrentVec is the left hand side.
22893 // Update the mask accordingly.
22894 SV0 = CurrentVec;
22895 Mask.push_back(Idx);
22896 continue;
22897 }
22898 if (!SV1.getNode() || SV1 == CurrentVec) {
22899 // Ok. CurrentVec is the right hand side.
22900 // Update the mask accordingly.
22901 SV1 = CurrentVec;
22902 Mask.push_back(Idx + NumElts);
22903 continue;
22904 }
22906 // Last chance - see if the vector is another shuffle and if it
22907 // uses one of the existing candidate shuffle ops.
22908 if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) {
22909 int InnerIdx = CurrentSVN->getMaskElt(Idx);
22910 if (InnerIdx < 0) {
22911 Mask.push_back(-1);
22912 continue;
22913 }
22914 SDValue InnerVec = (InnerIdx < (int)NumElts)
22915 ? CurrentSVN->getOperand(0)
22916 : CurrentSVN->getOperand(1);
22917 if (InnerVec.isUndef()) {
22918 Mask.push_back(-1);
22919 continue;
22920 }
22921 InnerIdx %= NumElts;
22922 if (InnerVec == SV0) {
22923 Mask.push_back(InnerIdx);
22924 continue;
22925 }
22926 if (InnerVec == SV1) {
22927 Mask.push_back(InnerIdx + NumElts);
22928 continue;
22929 }
22930 }
22932 // Bail out if we cannot convert the shuffle pair into a single shuffle.
22933 return false;
22934 }
22936 if (llvm::all_of(Mask, [](int M) { return M < 0; }))
22937 return true;
22939 // Avoid introducing shuffles with illegal mask.
22940 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
22941 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
22942 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
22943 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
22944 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
22945 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
22946 if (TLI.isShuffleMaskLegal(Mask, VT))
22947 return true;
22949 std::swap(SV0, SV1);
22950 ShuffleVectorSDNode::commuteMask(Mask);
22951 return TLI.isShuffleMaskLegal(Mask, VT);
22952 };
22954 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
22955 // Canonicalize shuffles according to rules:
22956 // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
22957 // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
22958 // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
22959 if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
22960 N0.getOpcode() != ISD::VECTOR_SHUFFLE) {
22961 // The incoming shuffle must be of the same type as the result of the
22962 // current shuffle.
22963 assert(N1->getOperand(0).getValueType() == VT &&
22964 "Shuffle types don't match");
22966 SDValue SV0 = N1->getOperand(0);
22967 SDValue SV1 = N1->getOperand(1);
22968 bool HasSameOp0 = N0 == SV0;
22969 bool IsSV1Undef = SV1.isUndef();
22970 if (HasSameOp0 || IsSV1Undef || N0 == SV1)
22971 // Commute the operands of this shuffle so merging below will trigger.
22972 return DAG.getCommutedVectorShuffle(*SVN);
22973 }
22975 // Canonicalize splat shuffles to the RHS to improve merging below.
22976 // shuffle(splat(A,u), shuffle(C,D)) -> shuffle'(shuffle(C,D), splat(A,u))
22977 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE &&
22978 N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
22979 cast<ShuffleVectorSDNode>(N0)->isSplat() &&
22980 !cast<ShuffleVectorSDNode>(N1)->isSplat()) {
22981 return DAG.getCommutedVectorShuffle(*SVN);
22982 }
22984 // Try to fold according to rules:
22985 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
22986 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
22987 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
22988 // Don't try to fold shuffles with illegal type.
22989 // Only fold if this shuffle is the only user of the other shuffle.
22990 // Try matching shuffle(C,shuffle(A,B)) commuted patterns as well.
22991 for (int i = 0; i != 2; ++i) {
22992 if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
22993 N->isOnlyUserOf(N->getOperand(i).getNode())) {
22994 // The incoming shuffle must be of the same type as the result of the
22995 // current shuffle.
22996 auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
22997 assert(OtherSV->getOperand(0).getValueType() == VT &&
22998 "Shuffle types don't match");
23001 SmallVector<int, 4> Mask;
23002 if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
23004 // Check if all indices in Mask are Undef. In case, propagate Undef.
23005 if (llvm::all_of(Mask, [](int M) { return M < 0; }))
23006 return DAG.getUNDEF(VT);
23008 return DAG.getVectorShuffle(VT, SDLoc(N),
23009 SV0 ? SV0 : DAG.getUNDEF(VT),
23010 SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
23011 }
23012 }
23013 }
23015 // Merge shuffles through binops if we are able to merge them with at least
23016 // one other shuffle.
23017 // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
23018 // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
23019 unsigned SrcOpcode = N0.getOpcode();
23020 if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
23021 (N1.isUndef() ||
23022 (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
23023 // Get binop source ops, or just pass on the undef.
23024 SDValue Op00 = N0.getOperand(0);
23025 SDValue Op01 = N0.getOperand(1);
23026 SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
23027 SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
23028 // TODO: We might be able to relax the VT check but we don't currently
23029 // have any isBinOp() that has different result/ops VTs so play safe until
23030 // we have test coverage.
23031 if (Op00.getValueType() == VT && Op10.getValueType() == VT &&
23032 Op01.getValueType() == VT && Op11.getValueType() == VT &&
23033 (Op00.getOpcode() == ISD::VECTOR_SHUFFLE ||
23034 Op10.getOpcode() == ISD::VECTOR_SHUFFLE ||
23035 Op01.getOpcode() == ISD::VECTOR_SHUFFLE ||
23036 Op11.getOpcode() == ISD::VECTOR_SHUFFLE)) {
23037 auto CanMergeInnerShuffle = [&](SDValue &SV0, SDValue &SV1,
23038 SmallVectorImpl<int> &Mask, bool LeftOp,
23039 bool Commute) {
23040 SDValue InnerN = Commute ? N1 : N0;
23041 SDValue Op0 = LeftOp ? Op00 : Op01;
23042 SDValue Op1 = LeftOp ? Op10 : Op11;
23043 if (Commute)
23044 std::swap(Op0, Op1);
23045 // Only accept the merged shuffle if we don't introduce undef elements,
23046 // or the inner shuffle already contained undef elements.
23047 auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(Op0);
23048 return SVN0 && InnerN->isOnlyUserOf(SVN0) &&
23049 MergeInnerShuffle(Commute, SVN, SVN0, Op1, TLI, SV0, SV1,
23050 Mask) &&
23051 (llvm::any_of(SVN0->getMask(), [](int M) { return M < 0; }) ||
23052 llvm::none_of(Mask, [](int M) { return M < 0; }));
23053 };
23055 // Ensure we don't increase the number of shuffles - we must merge a
23056 // shuffle from at least one of the LHS and RHS ops.
23057 bool MergedLeft = false;
23058 SDValue LeftSV0, LeftSV1;
23059 SmallVector<int, 4> LeftMask;
23060 if (CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, false) ||
23061 CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, true)) {
23062 MergedLeft = true;
23063 } else {
23064 LeftMask.assign(SVN->getMask().begin(), SVN->getMask().end());
23065 LeftSV0 = Op00, LeftSV1 = Op10;
23066 }
23068 bool MergedRight = false;
23069 SDValue RightSV0, RightSV1;
23070 SmallVector<int, 4> RightMask;
23071 if (CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, false) ||
23072 CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, true)) {
23073 MergedRight = true;
23074 } else {
23075 RightMask.assign(SVN->getMask().begin(), SVN->getMask().end());
23076 RightSV0 = Op01, RightSV1 = Op11;
23077 }
23079 if (MergedLeft || MergedRight) {
23080 SDLoc DL(N);
23081 SDValue LHS = DAG.getVectorShuffle(
23082 VT, DL, LeftSV0 ? LeftSV0 : DAG.getUNDEF(VT),
23083 LeftSV1 ? LeftSV1 : DAG.getUNDEF(VT), LeftMask);
23084 SDValue RHS = DAG.getVectorShuffle(
23085 VT, DL, RightSV0 ? RightSV0 : DAG.getUNDEF(VT),
23086 RightSV1 ? RightSV1 : DAG.getUNDEF(VT), RightMask);
23087 return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
23088 }
23089 }
23090 }
23091 }
23093 if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
23094 return V;
23096 return SDValue();
23097 }
23099 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
23100 SDValue InVal = N->getOperand(0);
23101 EVT VT = N->getValueType(0);
23103 // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
23104 // with a VECTOR_SHUFFLE and possible truncate.
23105 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
23106 VT.isFixedLengthVector() &&
23107 InVal->getOperand(0).getValueType().isFixedLengthVector()) {
23108 SDValue InVec = InVal->getOperand(0);
23109 SDValue EltNo = InVal->getOperand(1);
23110 auto InVecT = InVec.getValueType();
23111 if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
23112 SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
23113 int Elt = C0->getZExtValue();
23114 NewMask[0] = Elt;
23115 // If we have an implicit truncate, do the truncate here as long as it's
23116 // legal; if it's not legal, skip this fold.
23117 if (VT.getScalarType() != InVal.getValueType() &&
23118 InVal.getValueType().isScalarInteger() &&
23119 isTypeLegal(VT.getScalarType())) {
23120 SDValue Val =
23121 DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
23122 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
23123 }
23124 if (VT.getScalarType() == InVecT.getScalarType() &&
23125 VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
23126 SDValue LegalShuffle =
23127 TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
23128 DAG.getUNDEF(InVecT), NewMask, DAG);
23129 if (LegalShuffle) {
23130 // If the initial vector is the correct size this shuffle is a
23131 // simple map.
23132 if (VT == InVecT)
23133 return LegalShuffle;
23134 // If not we must truncate the vector.
23135 if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
23136 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
23137 EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
23138 InVecT.getVectorElementType(),
23139 VT.getVectorNumElements());
23140 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
23141 LegalShuffle, ZeroIdx);
23142 }
23143 }
23144 }
23145 }
23146 }
23148 return SDValue();
23149 }
23151 SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
23152 EVT VT = N->getValueType(0);
23153 SDValue N0 = N->getOperand(0);
23154 SDValue N1 = N->getOperand(1);
23155 SDValue N2 = N->getOperand(2);
23156 uint64_t InsIdx = N->getConstantOperandVal(2);
23158 // If inserting an UNDEF, just return the original vector.
23159 if (N1.isUndef())
23160 return N0;
23162 // If this is an insert of an extracted vector into an undef vector, we can
23163 // just use the input to the extract.
23164 if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23165 N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
23166 return N1.getOperand(0);
23168 // Simplify scalar inserts into an undef vector:
23169 // insert_subvector undef, (splat X), N2 -> splat X
23170 if (N0.isUndef() && N1.getOpcode() == ISD::SPLAT_VECTOR)
23171 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0));
23173 // If we are inserting a bitcast value into an undef, with the same
23174 // number of elements, just use the bitcast input of the extract.
23175 // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
23176 // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
23177 if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
23178 N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23179 N1.getOperand(0).getOperand(1) == N2 &&
23180 N1.getOperand(0).getOperand(0).getValueType().getVectorElementCount() ==
23181 VT.getVectorElementCount() &&
23182 N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
23183 VT.getSizeInBits()) {
23184 return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
23185 }
23187 // If both N0 and N1 are bitcast values on which insert_subvector
23188 // would make sense, pull the bitcast through.
23189 // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
23190 // BITCAST (INSERT_SUBVECTOR N0 N1 N2)
23191 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
23192 SDValue CN0 = N0.getOperand(0);
23193 SDValue CN1 = N1.getOperand(0);
23194 EVT CN0VT = CN0.getValueType();
23195 EVT CN1VT = CN1.getValueType();
23196 if (CN0VT.isVector() && CN1VT.isVector() &&
23197 CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
23198 CN0VT.getVectorElementCount() == VT.getVectorElementCount()) {
23199 SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
23200 CN0.getValueType(), CN0, CN1, N2);
23201 return DAG.getBitcast(VT, NewINSERT);
23202 }
23203 }
23205 // Combine INSERT_SUBVECTORs where we are inserting to the same index.
23206 // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
23207 // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
23208 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
23209 N0.getOperand(1).getValueType() == N1.getValueType() &&
23210 N0.getOperand(2) == N2)
23211 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
23212 N1, N2);
23214 // Eliminate an intermediate insert into an undef vector:
23215 // insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
23216 // insert_subvector undef, X, N2
23217 if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
23218 N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
23219 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
23220 N1.getOperand(1), N2);
23222 // Push subvector bitcasts to the output, adjusting the index as we go.
23223 // insert_subvector(bitcast(v), bitcast(s), c1)
23224 // -> bitcast(insert_subvector(v, s, c2))
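// For instance (types assumed, with A : v2i64 and B : v1i64):
// insert_subvector (bitcast A to v4i32), (bitcast B to v2i32), 2
// --> bitcast (insert_subvector A, B, 1)
// where the index is rescaled by Scale = 64 / 32 = 2.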
23225 if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
23226 N1.getOpcode() == ISD::BITCAST) {
23227 SDValue N0Src = peekThroughBitcasts(N0);
23228 SDValue N1Src = peekThroughBitcasts(N1);
23229 EVT N0SrcSVT = N0Src.getValueType().getScalarType();
23230 EVT N1SrcSVT = N1Src.getValueType().getScalarType();
23231 if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
23232 N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
23233 EVT NewVT;
23234 SDLoc DL(N);
23235 SDValue NewIdx;
23236 LLVMContext &Ctx = *DAG.getContext();
23237 ElementCount NumElts = VT.getVectorElementCount();
23238 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23239 if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
23240 unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
23241 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
23242 NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
23243 } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
23244 unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
23245 if (NumElts.isKnownMultipleOf(Scale) && (InsIdx % Scale) == 0) {
23246 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT,
23247 NumElts.divideCoefficientBy(Scale));
23248 NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
23249 }
23250 }
23251 if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
23252 SDValue Res = DAG.getBitcast(NewVT, N0Src);
23253 Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
23254 return DAG.getBitcast(VT, Res);
23255 }
23256 }
23257 }
23259 // Canonicalize insert_subvector dag nodes.
23260 // Example:
23261 // (insert_subvector (insert_subvector A, Idx0), Idx1)
23262 // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
23263 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
23264 N1.getValueType() == N0.getOperand(1).getValueType()) {
23265 unsigned OtherIdx = N0.getConstantOperandVal(2);
23266 if (InsIdx < OtherIdx) {
23267 // Swap nodes.
23268 SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
23269 N0.getOperand(0), N1, N2);
23270 AddToWorklist(NewOp.getNode());
23271 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
23272 VT, NewOp, N0.getOperand(1), N0.getOperand(2));
23273 }
23274 }
23276 // If the input vector is a concatenation, and the insert replaces
23277 // one of the pieces, we can optimize into a single concat_vectors.
23278 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
23279 N0.getOperand(0).getValueType() == N1.getValueType() &&
23280 N0.getOperand(0).getValueType().isScalableVector() ==
23281 N1.getValueType().isScalableVector()) {
23282 unsigned Factor = N1.getValueType().getVectorMinNumElements();
23283 SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
23284 Ops[InsIdx / Factor] = N1;
23285 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
23286 }
23288 // Simplify source operands based on insertion.
23289 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
23290 return SDValue(N, 0);
23292 return SDValue();
23293 }
23295 SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
23296 SDValue N0 = N->getOperand(0);
23298 // fold (fp_to_fp16 (fp16_to_fp op)) -> op
23299 if (N0->getOpcode() == ISD::FP16_TO_FP)
23300 return N0->getOperand(0);
23302 return SDValue();
23303 }
23305 SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
23306 SDValue N0 = N->getOperand(0);
23308 // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
23309 if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) {
23310 ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
23311 if (AndConst && AndConst->getAPIntValue() == 0xffff) {
23312 return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
23313 N0.getOperand(0));
23314 }
23315 }
23317 return SDValue();
23318 }
23320 SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
23321 SDValue N0 = N->getOperand(0);
23323 // fold (fp_to_bf16 (bf16_to_fp op)) -> op
23324 if (N0->getOpcode() == ISD::BF16_TO_FP)
23325 return N0->getOperand(0);
23327 return SDValue();
23328 }
23330 SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
23331 SDValue N0 = N->getOperand(0);
23332 EVT VT = N0.getValueType();
23333 unsigned Opcode = N->getOpcode();
23335 // VECREDUCE over 1-element vector is just an extract.
23336 if (VT.getVectorElementCount().isScalar()) {
23337 SDLoc dl(N);
23338 SDValue Res =
23339 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
23340 DAG.getVectorIdxConstant(0, dl));
23341 if (Res.getValueType() != N->getValueType(0))
23342 Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
23343 return Res;
23344 }
23346 // On a boolean vector an and/or reduction is the same as a umin/umax
23347 // reduction. Convert them if the latter is legal while the former isn't.
23348 if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
23349 unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
23350 ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
23351 if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
23352 TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
23353 DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
23354 return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
23355 }
23357 // vecreduce_or(insert_subvector(zero or undef, val)) -> vecreduce_or(val)
23358 // vecreduce_and(insert_subvector(ones or undef, val)) -> vecreduce_and(val)
23359 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
23360 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
23361 SDValue Vec = N0.getOperand(0);
23362 SDValue Subvec = N0.getOperand(1);
23363 if ((Opcode == ISD::VECREDUCE_OR &&
23364 (N0.getOperand(0).isUndef() || isNullOrNullSplat(Vec))) ||
23365 (Opcode == ISD::VECREDUCE_AND &&
23366 (N0.getOperand(0).isUndef() || isAllOnesOrAllOnesSplat(Vec))))
23367 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), Subvec);
23368 }
23370 return SDValue();
23371 }
23373 SDValue DAGCombiner::visitVPOp(SDNode *N) {
23374 // VP operations in which all vector elements are disabled - either by
23375 // determining that the mask is all false or that the EVL is 0 - can be
23376 // eliminated.
23377 bool AreAllEltsDisabled = false;
23378 if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode()))
23379 AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx));
23380 if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode()))
23381 AreAllEltsDisabled |=
23382 ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode());
23384 // This is the only generic VP combine we support for now.
23385 if (!AreAllEltsDisabled)
23386 return SDValue();
23388 // Binary operations can be replaced by UNDEF.
23389 if (ISD::isVPBinaryOp(N->getOpcode()))
23390 return DAG.getUNDEF(N->getValueType(0));
23392 // VP Memory operations can be replaced by either the chain (stores) or the
23393 // chain + undef (loads).
23394 if (const auto *MemSD = dyn_cast<MemSDNode>(N)) {
23395 if (MemSD->writeMem())
23396 return MemSD->getChain();
23397 return CombineTo(N, DAG.getUNDEF(N->getValueType(0)), MemSD->getChain());
23398 }
23400 // Reduction operations return the start operand when no elements are active.
23401 if (ISD::isVPReduction(N->getOpcode()))
23402 return N->getOperand(0);
23404 return SDValue();
23405 }
23407 /// Returns a vector_shuffle if it is able to transform an AND to a vector_shuffle
23408 /// with the destination vector and a zero vector.
23409 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
23410 /// vector_shuffle V, Zero, <0, 4, 2, 4>
23411 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
23412 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
23414 EVT VT = N->getValueType(0);
23415 SDValue LHS = N->getOperand(0);
23416 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
23417 SDLoc DL(N);
23419 // Make sure we're not running after operation legalization where it
23420 // may have custom lowered the vector shuffles.
23421 if (LegalOperations)
23422 return SDValue();
23424 if (RHS.getOpcode() != ISD::BUILD_VECTOR)
23425 return SDValue();
23427 EVT RVT = RHS.getValueType();
23428 unsigned NumElts = RHS.getNumOperands();
23430 // Attempt to create a valid clear mask, splitting the mask into
23431 // sub elements and checking to see if each is
23432 // all zeros or all ones - suitable for shuffle masking.
23433 auto BuildClearMask = [&](int Split) {
23434 int NumSubElts = NumElts * Split;
23435 int NumSubBits = RVT.getScalarSizeInBits() / Split;
23437 SmallVector<int, 8> Indices;
23438 for (int i = 0; i != NumSubElts; ++i) {
23439 int EltIdx = i / Split;
23440 int SubIdx = i % Split;
23441 SDValue Elt = RHS.getOperand(EltIdx);
23442 // X & undef --> 0 (not undef). So this lane must be converted to choose
23443 // from the zero constant vector (same as if the element had all 0-bits).
23444 if (Elt.isUndef()) {
23445 Indices.push_back(i + NumSubElts);
23446 continue;
23447 }
23449 APInt Bits;
23450 if (isa<ConstantSDNode>(Elt))
23451 Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
23452 else if (isa<ConstantFPSDNode>(Elt))
23453 Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
23454 else
23455 return SDValue();
23457 // Extract the sub element from the constant bit mask.
23458 if (DAG.getDataLayout().isBigEndian())
23459 Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
23460 else
23461 Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);
23463 if (Bits.isAllOnes())
23464 Indices.push_back(i);
23465 else if (Bits == 0)
23466 Indices.push_back(i + NumSubElts);
23467 else
23468 return SDValue();
23469 }
23471 // Let's see if the target supports this vector_shuffle.
23472 EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
23473 EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
23474 if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
23475 return SDValue();
23477 SDValue Zero = DAG.getConstant(0, DL, ClearVT);
23478 return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
23479 DAG.getBitcast(ClearVT, LHS),
23480 Zero, Indices));
23481 };
23483 // Determine maximum split level (byte level masking).
23484 int MaxSplit = 1;
23485 if (RVT.getScalarSizeInBits() % 8 == 0)
23486 MaxSplit = RVT.getScalarSizeInBits() / 8;
23488 for (int Split = 1; Split <= MaxSplit; ++Split)
23489 if (RVT.getScalarSizeInBits() % Split == 0)
23490 if (SDValue S = BuildClearMask(Split))
23491 return S;
23493 return SDValue();
23494 }
23496 /// If a vector binop is performed on splat values, it may be profitable to
23497 /// extract, scalarize, and insert/splat.
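/// e.g. (assumed v4i32 inputs): add (splat X), (splat Y) --> splat (add X, Y),
/// so the binop runs once on scalars instead of once per vector lane.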
23498 static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
23499 const SDLoc &DL) {
23500 SDValue N0 = N->getOperand(0);
23501 SDValue N1 = N->getOperand(1);
23502 unsigned Opcode = N->getOpcode();
23503 EVT VT = N->getValueType(0);
23504 EVT EltVT = VT.getVectorElementType();
23505 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23507 // TODO: Remove/replace the extract cost check? If the elements are available
23508 // as scalars, then there may be no extract cost. Should we ask if
23509 // inserting a scalar back into a vector is cheap instead?
23510 int Index0, Index1;
23511 SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
23512 SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
23513 // Extract element from splat_vector should be free.
23514 // TODO: use DAG.isSplatValue instead?
23515 bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR &&
23516 N1.getOpcode() == ISD::SPLAT_VECTOR;
23517 if (!Src0 || !Src1 || Index0 != Index1 ||
23518 Src0.getValueType().getVectorElementType() != EltVT ||
23519 Src1.getValueType().getVectorElementType() != EltVT ||
23520 !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) ||
23521 !TLI.isOperationLegalOrCustom(Opcode, EltVT))
23522 return SDValue();
23524 SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
23525 SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
23526 SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
23527 SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
23529 // If all lanes but 1 are undefined, no need to splat the scalar result.
23530 // TODO: Keep track of undefs and use that info in the general case.
23531 if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
23532 count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
23533 count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
23534 // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
23535 // build_vec ..undef, (bo X, Y), undef...
23536 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
23537 Ops[Index0] = ScalarBO;
23538 return DAG.getBuildVector(VT, DL, Ops);
23539 }
23541 // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
23542 if (VT.isScalableVector())
23543 return DAG.getSplatVector(VT, DL, ScalarBO);
23544 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
23545 return DAG.getBuildVector(VT, DL, Ops);
23546 }
23548 /// Visit a binary vector operation, like ADD.
23549 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) {
23550 EVT VT = N->getValueType(0);
23551 assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
23553 SDValue LHS = N->getOperand(0);
23554 SDValue RHS = N->getOperand(1);
23555 unsigned Opcode = N->getOpcode();
23556 SDNodeFlags Flags = N->getFlags();
23558 // Move unary shuffles with identical masks after a vector binop:
23559 // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
23560 // --> shuffle (VBinOp A, B), Undef, Mask
23561 // This does not require type legality checks because we are creating the
23562 // same types of operations that are in the original sequence. We do have to
23563 // restrict ops like integer div that have immediate UB (eg, div-by-zero)
23564 // though. This code is adapted from the identical transform in instcombine.
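// e.g. (assumed v4i32 values with a shared unary mask M = <1,0,3,2>):
// add (shuffle A, undef, M), (shuffle B, undef, M)
// --> shuffle (add A, B), undef, M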
23565 if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
23566 Opcode != ISD::UREM && Opcode != ISD::SREM &&
23567 Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
23568 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
23569 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
23570 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
23571 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
23572 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
23573 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
23574 RHS.getOperand(0), Flags);
23575 SDValue UndefV = LHS.getOperand(1);
23576 return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
23577 }
23579 // Try to sink a splat shuffle after a binop with a uniform constant.
23580 // This is limited to cases where neither the shuffle nor the constant have
23581 // undefined elements because that could be poison-unsafe or inhibit
23582 // demanded elements analysis. It is further limited to not change a splat
23583 // of an inserted scalar because that may be optimized better by
23584 // load-folding or other target-specific behaviors.
23585 if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
23586 Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
23587 Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
23588 // binop (splat X), (splat C) --> splat (binop X, C)
23589 SDValue X = Shuf0->getOperand(0);
23590 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
23591 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
23592 Shuf0->getMask());
23593 }
23594 if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
23595 Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
23596 Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
23597 // binop (splat C), (splat X) --> splat (binop C, X)
23598 SDValue X = Shuf1->getOperand(0);
23599 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
23600 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
23601 Shuf1->getMask());
23602 }
23603 }
23605 // The following pattern is likely to emerge with vector reduction ops. Moving
23606 // the binary operation ahead of insertion may allow using a narrower vector
23607 // instruction that has better performance than the wide version of the op:
23608 // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
23609 if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
23610 RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
23611 LHS.getOperand(2) == RHS.getOperand(2) &&
23612 (LHS.hasOneUse() || RHS.hasOneUse())) {
23613 SDValue X = LHS.getOperand(1);
23614 SDValue Y = RHS.getOperand(1);
23615 SDValue Z = LHS.getOperand(2);
23616 EVT NarrowVT = X.getValueType();
23617 if (NarrowVT == Y.getValueType() &&
23618 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT,
23619 LegalOperations)) {
23620 // (binop undef, undef) may not return undef, so compute that result.
23621 SDValue VecC =
23622 DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
23623 SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
23624 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
23625 }
23626 }
23628 // Make sure all but the first op are undef or constant.
23629 auto ConcatWithConstantOrUndef = [](SDValue Concat) {
23630 return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
23631 all_of(drop_begin(Concat->ops()), [](const SDValue &Op) {
23632 return Op.isUndef() ||
23633 ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
23634 });
23635 };
23637 // The following pattern is likely to emerge with vector reduction ops. Moving
23638 // the binary operation ahead of the concat may allow using a narrower vector
23639 // instruction that has better performance than the wide version of the op:
23640 // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
23641 // concat (VBinOp X, Y), VecC
23642 if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
23643 (LHS.hasOneUse() || RHS.hasOneUse())) {
23644 EVT NarrowVT = LHS.getOperand(0).getValueType();
23645 if (NarrowVT == RHS.getOperand(0).getValueType() &&
23646 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
23647 unsigned NumOperands = LHS.getNumOperands();
23648 SmallVector<SDValue, 4> ConcatOps;
23649 for (unsigned i = 0; i != NumOperands; ++i) {
23650 // This constant folds for operands 1 and up.
23651 ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
23652 RHS.getOperand(i)));
23653 }
23655 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
23656 }
23657 }
23659 if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL))
23660 return V;
23662 return SDValue();
23663 }
23665 SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
23666 SDValue N2) {
23667 assert(N0.getOpcode() == ISD::SETCC && "First argument must be a SetCC node!");
23669 SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
23670 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23672 // If we got a simplified select_cc node back from SimplifySelectCC, then
23673 // break it down into a new SETCC node, and a new SELECT node, and then return
23674 // the SELECT node, since we were called with a SELECT node.
23675 if (SCC.getNode()) {
23676 // Check to see if we got a select_cc back (to turn into setcc/select).
23677 // Otherwise, just return whatever node we got back, like fabs.
23678 if (SCC.getOpcode() == ISD::SELECT_CC) {
23679 const SDNodeFlags Flags = N0->getFlags();
23680 SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
23681 N0.getValueType(),
23682 SCC.getOperand(0), SCC.getOperand(1),
23683 SCC.getOperand(4), Flags);
23684 AddToWorklist(SETCC.getNode());
23685 SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
23686 SCC.getOperand(2), SCC.getOperand(3));
23687 SelectNode->setFlags(Flags);
23688 return SelectNode;
23689 }
23691 return SCC;
23692 }
23693 return SDValue();
23694 }
23696 /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
23697 /// being selected between, see if we can simplify the select. Callers of this
23698 /// should assume that TheSelect is deleted if this returns true. As such, they
23699 /// should return the appropriate thing (e.g. the node) back to the top-level of
23700 /// the DAG combiner loop to avoid it being looked at.
23701 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
23702 SDValue RHS) {
23703 // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
23704 // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
23705 if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
23706 if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
23707 // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
23708 SDValue Sqrt = RHS;
23709 ISD::CondCode CC;
23710 SDValue CmpLHS;
23711 const ConstantFPSDNode *Zero = nullptr;
23713 if (TheSelect->getOpcode() == ISD::SELECT_CC) {
23714 CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
23715 CmpLHS = TheSelect->getOperand(0);
23716 Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
23717 } else {
23718 // SELECT or VSELECT
23719 SDValue Cmp = TheSelect->getOperand(0);
23720 if (Cmp.getOpcode() == ISD::SETCC) {
23721 CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
23722 CmpLHS = Cmp.getOperand(0);
23723 Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
23724 }
23725 }
23726 if (Zero && Zero->isZero() &&
23727 Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
23728 CC == ISD::SETULT || CC == ISD::SETLT)) {
23729 // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
23730 CombineTo(TheSelect, Sqrt);
23731 return true;
23732 }
23733 }
23734 }
23735 // Cannot simplify select with vector condition
23736 if (TheSelect->getOperand(0).getValueType().isVector()) return false;
23738 // If this is a select from two identical things, try to pull the operation
23739 // through the select.
23740 if (LHS.getOpcode() != RHS.getOpcode() ||
23741 !LHS.hasOneUse() || !RHS.hasOneUse())
23742 return false;
23744 // If this is a load and the token chain is identical, replace the select
23745 // of two loads with a load through a select of the address to load from.
23746 // This triggers in things like "select bool X, 10.0, 123.0" after the FP
23747 // constants have been dropped into the constant pool.
23748 if (LHS.getOpcode() == ISD::LOAD) {
23749 LoadSDNode *LLD = cast<LoadSDNode>(LHS);
23750 LoadSDNode *RLD = cast<LoadSDNode>(RHS);
23752 // Token chains must be identical.
23753 if (LHS.getOperand(0) != RHS.getOperand(0) ||
23754 // Do not let this transformation reduce the number of volatile loads.
23755 // Be conservative for atomics for the moment
23756 // TODO: This does appear to be legal for unordered atomics (see D66309)
23757 !LLD->isSimple() || !RLD->isSimple() ||
23758 // FIXME: If either is a pre/post inc/dec load,
23759 // we'd need to split out the address adjustment.
23760 LLD->isIndexed() || RLD->isIndexed() ||
23761 // If this is an EXTLOAD, the VT's must match.
23762 LLD->getMemoryVT() != RLD->getMemoryVT() ||
23763 // If this is an EXTLOAD, the kind of extension must match.
23764 (LLD->getExtensionType() != RLD->getExtensionType() &&
23765 // The only exception is if one of the extensions is anyext.
23766 LLD->getExtensionType() != ISD::EXTLOAD &&
23767 RLD->getExtensionType() != ISD::EXTLOAD) ||
23768 // FIXME: this discards src value information. This is
23769 // over-conservative. It would be beneficial to be able to remember
23770 // both potential memory locations. Since we are discarding
23771 // src value info, don't do the transformation if the memory
23772 // locations are not in the default address space.
23773 LLD->getPointerInfo().getAddrSpace() != 0 ||
23774 RLD->getPointerInfo().getAddrSpace() != 0 ||
23775 // We can't produce a CMOV of a TargetFrameIndex since we won't
23776 // generate the address generation required.
23777 LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
23778 RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
23779 !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
23780 LLD->getBasePtr().getValueType()))
23783 // The loads must not depend on one another.
23784 if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
23787 // Check that the select condition doesn't reach either load. If so,
23788 // folding this will induce a cycle into the DAG. If not, this is safe to
23789 // xform, so create a select of the addresses.
23791 SmallPtrSet<const SDNode *, 32> Visited;
23792 SmallVector<const SDNode *, 16> Worklist;
23794 // Always fail if LLD and RLD are not independent. TheSelect is a
23795 // predecessor to all Nodes in question so we need not search past it.
23797 Visited.insert(TheSelect);
23798 Worklist.push_back(LLD);
23799 Worklist.push_back(RLD);
23801 if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
23802 SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
23803 return false;
23805 SDValue Addr;
23806 if (TheSelect->getOpcode() == ISD::SELECT) {
23807 // We cannot do this optimization if any pair of {RLD, LLD} is a
23808 // predecessor to {RLD, LLD, CondNode}. As we've already compared the
23809 // Loads, we only need to check if CondNode is a successor to one of the
23810 // loads. We can further avoid this if there's no use of their chain
23811 // value.
23812 SDNode *CondNode = TheSelect->getOperand(0).getNode();
23813 Worklist.push_back(CondNode);
23815 if ((LLD->hasAnyUseOfValue(1) &&
23816 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
23817 (RLD->hasAnyUseOfValue(1) &&
23818 SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
23819 return false;
23821 Addr = DAG.getSelect(SDLoc(TheSelect),
23822 LLD->getBasePtr().getValueType(),
23823 TheSelect->getOperand(0), LLD->getBasePtr(),
23824 RLD->getBasePtr());
23825 } else { // Otherwise SELECT_CC
23826 // We cannot do this optimization if any pair of {RLD, LLD} is a
23827 // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
23828 // the Loads, we only need to check if CondLHS/CondRHS is a successor to
23829 // one of the loads. We can further avoid this if there's no use of their
23830 // chain value.
23832 SDNode *CondLHS = TheSelect->getOperand(0).getNode();
23833 SDNode *CondRHS = TheSelect->getOperand(1).getNode();
23834 Worklist.push_back(CondLHS);
23835 Worklist.push_back(CondRHS);
23837 if ((LLD->hasAnyUseOfValue(1) &&
23838 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
23839 (RLD->hasAnyUseOfValue(1) &&
23840 SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
23841 return false;
23843 Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
23844 LLD->getBasePtr().getValueType(),
23845 TheSelect->getOperand(0),
23846 TheSelect->getOperand(1),
23847 LLD->getBasePtr(), RLD->getBasePtr(),
23848 TheSelect->getOperand(4));
23849 }
23851 SDValue Load;
23852 // It is safe to replace the two loads if they have different alignments,
23853 // but the new load must be the minimum (most restrictive) alignment of the
23855 Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
23856 MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
23857 if (!RLD->isInvariant())
23858 MMOFlags &= ~MachineMemOperand::MOInvariant;
23859 if (!RLD->isDereferenceable())
23860 MMOFlags &= ~MachineMemOperand::MODereferenceable;
23861 if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
23862 // FIXME: Discards pointer and AA info.
23863 Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
23864 LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
23867 // FIXME: Discards pointer and AA info.
23868 Load = DAG.getExtLoad(
23869 LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
23870 : LLD->getExtensionType(),
23871 SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
23872 MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
23875 // Users of the select now use the result of the load.
23876 CombineTo(TheSelect, Load);
23878 // Users of the old loads now use the new load's chain. We know the
23879 // old-load value is dead now.
23880 CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
23881 CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
/// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
/// bitwise 'and'.
SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
                                            SDValue N1, SDValue N2, SDValue N3,
                                            ISD::CondCode CC) {
  // If this is a select where the false operand is zero and the compare is a
  // check of the sign bit, see if we can perform the "gzip trick":
  // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
  // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
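  // For example, with i32 X the first pattern becomes (X >> 31) & A using an
  // arithmetic shift: (sra X, 31) is all-ones exactly when X is negative and
  // zero otherwise, so the 'and' yields A or 0 without a branch or select.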
  EVT XType = N0.getValueType();
  EVT AType = N2.getValueType();
  if (!isNullConstant(N3) || !XType.bitsGE(AType))
    return SDValue();

  // If the comparison is testing for a positive value, we have to invert
  // the sign bit mask, so only do that transform if the target has a bitwise
  // 'and not' instruction (the invert is free).
  if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
    // (X > -1) ? A : 0
    // (X > 0) ? X : 0 <-- This is canonical signed max.
    if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
      return SDValue();
  } else if (CC == ISD::SETLT) {
    // (X < 0) ? A : 0
    // (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
    if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
      return SDValue();
  } else {
    return SDValue();
  }

  // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
  // constant.
  EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
  if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
    unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
    if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
      SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
      SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
      AddToWorklist(Shift.getNode());

      if (XType.bitsGT(AType)) {
        Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
        AddToWorklist(Shift.getNode());
      }

      if (CC == ISD::SETGT)
        Shift = DAG.getNOT(DL, Shift, AType);

      return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
    }
  }

  unsigned ShCt = XType.getSizeInBits() - 1;
  if (TLI.shouldAvoidTransformToShift(XType, ShCt))
    return SDValue();

  SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
  SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
  AddToWorklist(Shift.getNode());

  if (XType.bitsGT(AType)) {
    Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
    AddToWorklist(Shift.getNode());
  }

  if (CC == ISD::SETGT)
    Shift = DAG.getNOT(DL, Shift, AType);

  return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
}

// Fold select(cc, binop(), binop()) -> binop(select(), select()) etc.
SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  unsigned BinOpc = N1.getOpcode();
  if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
    return SDValue();

  // The use checks are intentionally on SDNode because we may be dealing
  // with opcodes that produce more than one SDValue.
  // TODO: Do we really need to check N0 (the condition operand of the select)?
  //       But removing that clause could cause an infinite loop...
  if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
    return SDValue();

  // Binops may include opcodes that return multiple values, so all values
  // must be created/propagated from the newly created binops below.
  SDVTList OpVTs = N1->getVTList();

  // Fold select(cond, binop(x, y), binop(z, y))
  //  --> binop(select(cond, x, z), y)
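  // E.g., select(Cond, add(X, Y), add(Z, Y)) --> add(select(Cond, X, Z), Y),
  // replacing two adds with one add plus a (usually cheap) select.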
  if (N1.getOperand(1) == N2.getOperand(1)) {
    SDValue NewSel =
        DAG.getSelect(DL, VT, N0, N1.getOperand(0), N2.getOperand(0));
    SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1));
    NewBinOp->setFlags(N1->getFlags());
    NewBinOp->intersectFlagsWith(N2->getFlags());
    return SDValue(NewBinOp.getNode(), N1.getResNo());
  }

  // Fold select(cond, binop(x, y), binop(x, z))
  //  --> binop(x, select(cond, y, z))
  // Second op VT might be different (e.g. shift amount type)
  if (N1.getOperand(0) == N2.getOperand(0) &&
      VT == N1.getOperand(1).getValueType() &&
      VT == N2.getOperand(1).getValueType()) {
    SDValue NewSel =
        DAG.getSelect(DL, VT, N0, N1.getOperand(1), N2.getOperand(1));
    SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel);
    NewBinOp->setFlags(N1->getFlags());
    NewBinOp->intersectFlagsWith(N2->getFlags());
    return SDValue(NewBinOp.getNode(), N1.getResNo());
  }

  // TODO: Handle isCommutativeBinOp patterns as well?
  return SDValue();
}

// Transform (fneg/fabs (bitconvert x)) to avoid loading constant pool values.
SDValue DAGCombiner::foldSignChangeInBitcast(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  bool IsFabs = N->getOpcode() == ISD::FABS;
  bool IsFree = IsFabs ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT);

  if (IsFree || N0.getOpcode() != ISD::BITCAST || !N0.hasOneUse())
    return SDValue();

  SDValue Int = N0.getOperand(0);
  EVT IntVT = Int.getValueType();

  // The operand to cast should be integer.
  if (!IntVT.isInteger() || IntVT.isVector())
    return SDValue();

  // (fneg (bitconvert x)) -> (bitconvert (xor x sign))
  // (fabs (bitconvert x)) -> (bitconvert (and x ~sign))
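  // E.g., for an f32 bitcast to i32, fneg is an xor with 0x80000000 and fabs
  // is an and with 0x7fffffff, flipping or clearing just the sign bit.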
  APInt SignMask;
  if (N0.getValueType().isVector()) {
    // For vector, create a sign mask (0x80...) or its inverse (for fabs,
    // 0x7f...) per element and splat it.
    SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
    if (IsFabs)
      SignMask = ~SignMask;
    SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
  } else {
    // For scalar, just use the sign mask (0x80... or the inverse, 0x7f...)
    SignMask = APInt::getSignMask(IntVT.getSizeInBits());
    if (IsFabs)
      SignMask = ~SignMask;
  }
  SDLoc DL(N0);
  Int = DAG.getNode(IsFabs ? ISD::AND : ISD::XOR, DL, IntVT, Int,
                    DAG.getConstant(SignMask, DL, IntVT));
  AddToWorklist(Int.getNode());
  return DAG.getBitcast(VT, Int);
}

/// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4))"
/// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
/// in it. This may be a win when the constant is not otherwise available
/// because it replaces two constant pool loads with one.
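/// For example, a select between 1.0f and 2.0f becomes a single load from a
/// two-element f32 constant pool array, with the comparison choosing a byte
/// offset of 0 or 4 into it.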
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
    const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
    ISD::CondCode CC) {
  if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
    return SDValue();

  // If we are before legalize types, we want the other legalization to happen
  // first (for example, to avoid messing with soft float).
  auto *TV = dyn_cast<ConstantFPSDNode>(N2);
  auto *FV = dyn_cast<ConstantFPSDNode>(N3);
  EVT VT = N2.getValueType();
  if (!TV || !FV || !TLI.isTypeLegal(VT))
    return SDValue();

  // If a constant can be materialized without loads, this does not make sense.
  if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
      TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
      TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
    return SDValue();

  // If both constants have multiple uses, then we won't need to do an extra
  // load. The values are likely around in registers for other users.
  if (!TV->hasOneUse() && !FV->hasOneUse())
    return SDValue();

  Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
                       const_cast<ConstantFP*>(TV->getConstantFPValue()) };
  Type *FPTy = Elts[0]->getType();
  const DataLayout &TD = DAG.getDataLayout();

  // Create a ConstantArray of the two constants.
  Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
  SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
                                      TD.getPrefTypeAlign(FPTy));
  Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();

  // Get offsets to the 0 and 1 elements of the array, so we can select between
  // them.
  SDValue Zero = DAG.getIntPtrConstant(0, DL);
  unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
  SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
  SDValue Cond =
      DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
  AddToWorklist(Cond.getNode());
  SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
  AddToWorklist(CstOffset.getNode());
  CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
  AddToWorklist(CPIdx.getNode());
  return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
                     MachinePointerInfo::getConstantPool(
                         DAG.getMachineFunction()), Alignment);
}

/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
/// where 'cond' is the comparison specified by CC.
SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
                                      SDValue N2, SDValue N3, ISD::CondCode CC,
                                      bool NotExtCompare) {
  // (x ? y : y) -> y.
  if (N2 == N3) return N2;

  EVT CmpOpVT = N0.getValueType();
  EVT CmpResVT = getSetCCResultType(CmpOpVT);
  EVT VT = N2.getValueType();
  auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
  auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
  auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());

  // Determine if the condition we're dealing with is constant.
  if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
    AddToWorklist(SCC.getNode());
    if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
      // fold select_cc true, x, y -> x
      // fold select_cc false, x, y -> y
      return !(SCCC->isZero()) ? N2 : N3;
    }
  }

  if (SDValue V =
          convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
    return V;

  if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
    return V;

  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
  // where y has a single bit set.
  // A plaintext description would be: we can turn the SELECT_CC into an AND
  // when the condition can be materialized as an all-ones register.  Any
  // single bit-test can be materialized as an all-ones register with
  // shift-left and shift-right-arith.
  if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
      N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
    SDValue AndLHS = N0->getOperand(0);
    auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
    if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
      // Shift the tested bit over the sign bit.
      const APInt &AndMask = ConstAndRHS->getAPIntValue();
      unsigned ShCt = AndMask.getBitWidth() - 1;
      if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
        SDValue ShlAmt =
            DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
                            getShiftAmountTy(AndLHS.getValueType()));
        SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);

        // Now arithmetic right shift it all the way over, so the result is
        // either all-ones, or zero.
        SDValue ShrAmt =
            DAG.getConstant(ShCt, SDLoc(Shl),
                            getShiftAmountTy(Shl.getValueType()));
        SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);

        return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
      }
    }
  }

  // fold select C, 16, 0 -> shl C, 4
  bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
  bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();

  if ((Fold || Swap) &&
      TLI.getBooleanContents(CmpOpVT) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {

    if (Swap) {
      CC = ISD::getSetCCInverse(CC, CmpOpVT);
      std::swap(N2C, N3C);
    }

    // If the caller doesn't want us to simplify this into a zext of a compare,
    // don't do it.
    if (NotExtCompare && N2C->isOne())
      return SDValue();

    SDValue Temp, SCC;
    // zext (setcc n0, n1)
    if (LegalTypes) {
      SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
      if (VT.bitsLT(SCC.getValueType()))
        Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
      else
        Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    } else {
      SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
      Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
    }

    AddToWorklist(SCC.getNode());
    AddToWorklist(Temp.getNode());

    if (N2C->isOne())
      return Temp;

    unsigned ShCt = N2C->getAPIntValue().logBase2();
    if (TLI.shouldAvoidTransformToShift(VT, ShCt))
      return SDValue();

    // shl setcc result by log2 n2c
    return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
                       DAG.getConstant(ShCt, SDLoc(Temp),
                                       getShiftAmountTy(Temp.getValueType())));
  }

  // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
  // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
  // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
  // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
  // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
  // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
  if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    SDValue ValueOnZero = N2;
    SDValue Count = N3;
    // If the condition is NE instead of E, swap the operands.
    if (CC == ISD::SETNE)
      std::swap(ValueOnZero, Count);
    // Check if the value on zero is a constant equal to the bits in the type.
    if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
      if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
        // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
        // legal, combine to just cttz.
        if ((Count.getOpcode() == ISD::CTTZ ||
             Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
          return DAG.getNode(ISD::CTTZ, DL, VT, N0);
        // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
        // legal, combine to just ctlz.
        if ((Count.getOpcode() == ISD::CTLZ ||
             Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
            N0 == Count.getOperand(0) &&
            (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
          return DAG.getNode(ISD::CTLZ, DL, VT, N0);
      }
    }
  }

  // Fold select_cc setgt X, -1, C, ~C -> xor (ashr X, BW-1), C
  // Fold select_cc setlt X, 0, C, ~C -> xor (ashr X, BW-1), ~C
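  // E.g., for i32 with C == 5: when X >= 0, (ashr X, 31) is 0 and the xor
  // yields 5; when X < 0, it is all-ones and the xor yields ~5.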
  if (!NotExtCompare && N1C && N2C && N3C &&
      N2C->getAPIntValue() == ~N3C->getAPIntValue() &&
      ((N1C->isAllOnes() && CC == ISD::SETGT) ||
       (N1C->isZero() && CC == ISD::SETLT)) &&
      !TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) {
    SDValue ASR = DAG.getNode(
        ISD::SRA, DL, CmpOpVT, N0,
        DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT));
    return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT),
                       DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT));
  }

  if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG))
    return S;
  if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG))
    return S;

  return SDValue();
}

/// This is a stub for TargetLowering::SimplifySetCC.
SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
                                   ISD::CondCode Cond, const SDLoc &DL,
                                   bool foldBooleans) {
  TargetLowering::DAGCombinerInfo
    DagCombineInfo(DAG, Level, false, this);
  return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
}

/// Given an ISD::SDIV node expressing a divide by constant, return
/// a DAG expression to select that will generate the same value by multiplying
/// by a magic number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
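/// For example, an i32 divide by 3 can become a multiply-high by the magic
/// constant 0x55555556 plus a small sign fix-up, avoiding a hardware divide.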
SDValue DAGCombiner::BuildSDIV(SDNode *N) {
  // when optimising for minimum size, we don't want to expand a div to a mul
  // and a shift.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
    for (SDNode *N : Built)
      AddToWorklist(N);
    return S;
  }

  return SDValue();
}

/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
/// DAG expression that will generate the same value by right shifting.
SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
  if (!C)
    return SDValue();

  // Avoid division by zero.
  if (C->isZero())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) {
    for (SDNode *N : Built)
      AddToWorklist(N);
    return S;
  }

  return SDValue();
}

/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
/// expression that will generate the same value by multiplying by a magic
/// number.
/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
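/// For example, an i32 unsigned divide by 3 can become a multiply-high by
/// 0xAAAAAAAB followed by a logical right shift of 1.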
SDValue DAGCombiner::BuildUDIV(SDNode *N) {
  // when optimising for minimum size, we don't want to expand a div to a mul
  // and a shift.
  if (DAG.getMachineFunction().getFunction().hasMinSize())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
    for (SDNode *N : Built)
      AddToWorklist(N);
    return S;
  }

  return SDValue();
}

/// Given an ISD::SREM node expressing a remainder by constant power of 2,
/// return a DAG expression that will generate the same value.
SDValue DAGCombiner::BuildSREMPow2(SDNode *N) {
  ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
  if (!C)
    return SDValue();

  // Avoid division by zero.
  if (C->isZero())
    return SDValue();

  SmallVector<SDNode *, 8> Built;
  if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) {
    for (SDNode *N : Built)
      AddToWorklist(N);
    return S;
  }

  return SDValue();
}

/// Determines the LogBase2 value for a non-null input value using the
/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
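/// E.g., for i32 V == 16: ctlz(16) == 27, so LogBase2 == 31 - 27 == 4.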
SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
  EVT VT = V.getValueType();
  SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V);
  SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
  SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz);
  return LogBase2;
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal, we need to find the zero of the function:
///   F(X) = 1/X - A [which has a zero at X = 1/A]
///     =>
///   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
///     does not require additional intermediate precision]
/// For the last iteration, put numerator N into it to gain more precision:
///   Result = N X_i + X_i (N - N A X_i)
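/// Convergence is quadratic: if X_i = (1 + e) / A, then
///   X_{i+1} = X_i (2 - A X_i) = (1 - e^2) / A,
/// so each refinement step roughly doubles the number of correct bits.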
SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
                                      SDNodeFlags Flags) {
  if (LegalDAG)
    return SDValue();

  // TODO: Handle extended types?
  EVT VT = Op.getValueType();
  if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
      VT.getScalarType() != MVT::f64)
    return SDValue();

  // If estimates are explicitly disabled for this function, we're done.
  MachineFunction &MF = DAG.getMachineFunction();
  int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
  if (Enabled == TLI.ReciprocalEstimate::Disabled)
    return SDValue();

  // Estimates may be explicitly enabled for this type with a custom number of
  // refinement steps.
  int Iterations = TLI.getDivRefinementSteps(VT, MF);
  if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
    AddToWorklist(Est.getNode());

    SDLoc DL(Op);
    SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);

    // Newton iterations: Est = Est + Est (N - Arg * Est)
    // If this is the last iteration, also multiply by the numerator.
    for (int i = 0; i < Iterations; ++i) {
      SDValue MulEst = Est;

      if (i == Iterations - 1) {
        MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
        AddToWorklist(MulEst.getNode());
      }

      SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
      AddToWorklist(NewEst.getNode());

      NewEst = DAG.getNode(ISD::FSUB, DL, VT,
                           (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
      AddToWorklist(NewEst.getNode());

      NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
      AddToWorklist(NewEst.getNode());

      Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
      AddToWorklist(Est.getNode());
    }
    if (!Iterations) {
      // If no iterations are available, multiply with N.
      Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
      AddToWorklist(Est.getNode());
    }

    return Est;
  }

  return SDValue();
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
/// As a result, we precompute A/2 prior to the iteration loop.
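/// This form follows from the generic step with F'(X) = -2 / X^3:
///   X - F(X)/F'(X) = X + (1/X^2 - A) X^3 / 2 = X (1.5 - A X^2 / 2)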
SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  EVT VT = Arg.getValueType();
  SDLoc DL(Arg);
  SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);

  // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
  // this entire sequence requires only one FP constant.
  SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
  HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);

  // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
  for (unsigned i = 0; i < Iterations; ++i) {
    SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
    NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
    NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
  }

  // If non-reciprocal square root is requested, multiply the result by Arg.
  if (!Reciprocal)
    Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);

  return Est;
}

/// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
/// For the reciprocal sqrt, we need to find the zero of the function:
///   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
///     =>
///   X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
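/// This is algebraically the same step as the one-constant form, since
///   (-0.5 X) (A X^2 - 3.0) = 1.5 X - A X^3 / 2 = X (1.5 - A X^2 / 2),
/// but it folds the halving into the constants -0.5 and -3.0 instead of
/// precomputing A/2.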
SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
                                         unsigned Iterations,
                                         SDNodeFlags Flags, bool Reciprocal) {
  EVT VT = Arg.getValueType();
  SDLoc DL(Arg);
  SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
  SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);

  // This routine must enter the loop below to work correctly
  // when (Reciprocal == false).
  assert(Iterations > 0);

  // Newton iterations for reciprocal square root:
  // E = (E * -0.5) * ((A * E) * E + -3.0)
  for (unsigned i = 0; i < Iterations; ++i) {
    SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
    SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
    SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);

    // When calculating a square root at the last iteration build:
    // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
    // (notice a common subexpression)
    SDValue LHS;
    if (Reciprocal || (i + 1) < Iterations) {
      // RSQRT: LHS = (E * -0.5)
      LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
    } else {
      // SQRT: LHS = (A * E) * -0.5
      LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
    }

    Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
  }

  return Est;
}

/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
/// Op can be zero.
SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
                                           bool Reciprocal) {
  if (LegalDAG)
    return SDValue();

  // TODO: Handle extended types?
  EVT VT = Op.getValueType();
  if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
      VT.getScalarType() != MVT::f64)
    return SDValue();

  // If estimates are explicitly disabled for this function, we're done.
  MachineFunction &MF = DAG.getMachineFunction();
  int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
  if (Enabled == TLI.ReciprocalEstimate::Disabled)
    return SDValue();

  // Estimates may be explicitly enabled for this type with a custom number of
  // refinement steps.
  int Iterations = TLI.getSqrtRefinementSteps(VT, MF);

  bool UseOneConstNR = false;
  if (SDValue Est =
      TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
                          Reciprocal)) {
    AddToWorklist(Est.getNode());

    if (Iterations)
      Est = UseOneConstNR
            ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
            : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
    if (!Reciprocal) {
      SDLoc DL(Op);
      // Try the target specific test first.
      SDValue Test = TLI.getSqrtInputTest(Op, DAG, DAG.getDenormalMode(VT));

      // The estimate is now completely wrong if the input was exactly 0.0 or
      // possibly a denormal. Force the answer to 0.0 or value provided by
      // target for those cases.
      Est = DAG.getNode(
          Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
          Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est);
    }
    return Est;
  }

  return SDValue();
}

SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  return buildSqrtEstimateImpl(Op, Flags, true);
}

SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
  return buildSqrtEstimateImpl(Op, Flags, false);
}

/// Return true if there is any possibility that the two addresses overlap.
bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {

  struct MemUseCharacteristics {
    bool IsVolatile;
    bool IsAtomic;
    SDValue BasePtr;
    int64_t Offset;
    Optional<int64_t> NumBytes;
    MachineMemOperand *MMO;
  };

  auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
    if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
      int64_t Offset = 0;
      if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
        Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
                     ? C->getSExtValue()
                     : (LSN->getAddressingMode() == ISD::PRE_DEC)
                           ? -1 * C->getSExtValue()
                           : 0;
      uint64_t Size =
          MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
      return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
              Offset /*base offset*/,
              Optional<int64_t>(Size),
              LSN->getMemOperand()};
    }
    if (const auto *LN = cast<LifetimeSDNode>(N))
      return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
              (LN->hasOffset()) ? LN->getOffset() : 0,
              (LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
                                : Optional<int64_t>(),
              (MachineMemOperand *)nullptr};
    // Default.
    return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
            (int64_t)0 /*offset*/,
            Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
  };

  MemUseCharacteristics MUC0 = getCharacteristics(Op0),
                        MUC1 = getCharacteristics(Op1);

  // If they are to the same address, then they must be aliases.
  if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
      MUC0.Offset == MUC1.Offset)
    return true;

  // If they are both volatile then they cannot be reordered.
  if (MUC0.IsVolatile && MUC1.IsVolatile)
    return true;

  // Be conservative about atomics for the moment
  // TODO: This is way overconservative for unordered atomics (see D66309)
  if (MUC0.IsAtomic && MUC1.IsAtomic)
    return true;

  if (MUC0.MMO && MUC1.MMO) {
    if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
        (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
      return false;
  }

  // Try to prove that there is aliasing, or that there is no aliasing. Either
  // way, we can return now. If nothing can be proved, proceed with more tests.
  bool IsAlias;
  if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
                                       DAG, IsAlias))
    return IsAlias;

  // The following all rely on MMO0 and MMO1 being valid. Fail conservatively
  // if either are not known.
  if (!MUC0.MMO || !MUC1.MMO)
    return true;

  // If one operation reads from invariant memory, and the other may store,
  // they cannot alias. These should really be checking the equivalent of
  // mayWrite, but it only matters for memory nodes other than load/store.
  if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
      (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
    return false;

  // If we know required SrcValue1 and SrcValue2 have relatively large
  // alignment compared to the size and offset of the access, we may be able
  // to prove they do not alias. This check is conservative for now to catch
  // cases created by splitting vector types, it only works when the offsets
  // are multiples of the size of the data.
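  // E.g., two 4-byte accesses with a common 16-byte base alignment at source
  // offsets 4 and 8 occupy [4, 8) and [8, 12) within every 16-byte window, so
  // they cannot overlap.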
  int64_t SrcValOffset0 = MUC0.MMO->getOffset();
  int64_t SrcValOffset1 = MUC1.MMO->getOffset();
  Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
  Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
  auto &Size0 = MUC0.NumBytes;
  auto &Size1 = MUC1.NumBytes;
  if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
      Size0.has_value() && Size1.has_value() && *Size0 == *Size1 &&
      OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
      SrcValOffset1 % *Size1 == 0) {
    int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
    int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();

    // There is no overlap between these relatively aligned accesses of
    // similar size. Return no alias.
    if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
      return false;
  }

  bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
                   ? CombinerGlobalAA
                   : DAG.getSubtarget().useAA();
#ifndef NDEBUG
  if (CombinerAAOnlyFunc.getNumOccurrences() &&
      CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
    UseAA = false;
#endif

  if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 &&
      Size1) {
    // Use alias analysis information.
    int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
    int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
    int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
    if (AA->isNoAlias(
            MemoryLocation(MUC0.MMO->getValue(), Overlap0,
                           UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
            MemoryLocation(MUC1.MMO->getValue(), Overlap1,
                           UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes())))
      return false;
  }

  // Otherwise we have to assume they alias.
  return true;
}

/// Walk up chain skipping non-aliasing memory nodes,
/// looking for aliasing nodes and adding them to the Aliases vector.
void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
                                   SmallVectorImpl<SDValue> &Aliases) {
  SmallVector<SDValue, 8> Chains;     // List of chains to visit.
  SmallPtrSet<SDNode *, 16> Visited;  // Visited node set.

  // Get alias information for node.
  // TODO: relax aliasing for unordered atomics (see D66309)
  const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();

  // Starting off.
  Chains.push_back(OriginalChain);
  unsigned Depth = 0;

  // Attempt to improve chain by a single step
  auto ImproveChain = [&](SDValue &C) -> bool {
    switch (C.getOpcode()) {
    case ISD::EntryToken:
      // No need to mark EntryToken.
      C = SDValue();
      return true;
    case ISD::LOAD:
    case ISD::STORE: {
      // Get alias information for C.
      // TODO: Relax aliasing for unordered atomics (see D66309)
      bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
                      cast<LSBaseSDNode>(C.getNode())->isSimple();
      if ((IsLoad && IsOpLoad) || !mayAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      // Alias, so stop here.
      return false;
    }

    case ISD::CopyFromReg:
      // Always forward past CopyFromReg.
      C = C.getOperand(0);
      return true;

    case ISD::LIFETIME_START:
    case ISD::LIFETIME_END: {
      // We can forward past any lifetime start/end that can be proven not to
      // alias the memory access.
      if (!mayAlias(N, C.getNode())) {
        // Look further up the chain.
        C = C.getOperand(0);
        return true;
      }
      return false;
    }
    default:
      return false;
    }
  };

  // Look at each chain and determine if it is an alias. If so, add it to the
  // aliases list. If not, then continue up the chain looking for the next
  // candidate.
  while (!Chains.empty()) {
    SDValue Chain = Chains.pop_back_val();

    // Don't bother if we've seen Chain before.
    if (!Visited.insert(Chain.getNode()).second)
      continue;

    // For TokenFactor nodes, look at each operand and only continue up the
    // chain until we reach the depth limit.
    //
    // FIXME: The depth check could be made to return the last non-aliasing
    // chain we found before we hit a tokenfactor rather than the original
    // chain.
    if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
      Aliases.clear();
      Aliases.push_back(OriginalChain);
      return;
    }

    if (Chain.getOpcode() == ISD::TokenFactor) {
      // We have to check each of the operands of the token factor for "small"
      // token factors, so we queue them up.  Adding the operands to the queue
      // (stack) in reverse order maintains the original order and increases
      // the likelihood that getNode will find a matching token factor (CSE.)
      if (Chain.getNumOperands() > 16) {
        Aliases.push_back(Chain);
        continue;
      }
      for (unsigned n = Chain.getNumOperands(); n;)
        Chains.push_back(Chain.getOperand(--n));
      ++Depth;
      continue;
    }
    // Everything else
    if (ImproveChain(Chain)) {
      // Updated Chain Found, Consider new chain if one exists.
      if (Chain.getNode())
        Chains.push_back(Chain);
      ++Depth;
      continue;
    }
    // No Improved Chain Possible, treat as Alias.
    Aliases.push_back(Chain);
  }
}

/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
/// (aliasing node.)
SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
  if (OptLevel == CodeGenOpt::None)
    return OldChain;

  // Ops for replacing token factor.
  SmallVector<SDValue, 8> Aliases;

  // Accumulate all the aliases to this node.
  GatherAllAliases(N, OldChain, Aliases);

  // If no operands then chain to entry token.
  if (Aliases.size() == 0)
    return DAG.getEntryNode();

  // If a single operand then chain to it.  We don't need to revisit it.
  if (Aliases.size() == 1)
    return Aliases[0];

  // Construct a custom tailored token factor.
  return DAG.getTokenFactor(SDLoc(N), Aliases);
}

namespace {

// TODO: Replace with std::monostate when we move to C++17.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }

} // namespace

// This function tries to collect a bunch of potentially interesting
// nodes to improve the chains of, all at once. This might seem
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.

bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
  SmallVector<StoreSDNode *, 8> ChainedStores;
  StoreSDNode *STChain = St;
  // Intervals records which offsets from BaseIndex have been covered. In
  // the common case, every store writes to the address immediately after the
  // previous one and is thus merged with the previous interval at insertion
  // time.
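  // E.g., a 4-byte store at offset 0 covers [0, 4); a following 4-byte store
  // at offset 4 extends that interval to [0, 8) rather than adding a new one.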
  using IMap =
      llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
  IMap::Allocator A;
  IMap Intervals(A);

  // This holds the base pointer, index, and the offset in bytes from the base
  // pointer.
  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // Do not handle stores to opaque types
  if (St->getMemoryVT().isZeroSized())
    return false;

  // BaseIndexOffset assumes that offsets are fixed-size, which
  // is not valid for scalable vectors where the offsets are
  // scaled by `vscale`, so bail out early.
  if (St->getMemoryVT().isScalableVector())
    return false;

  // Add ST's interval.
  Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);

  while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
    if (Chain->getMemoryVT().isScalableVector())
      return false;

    // If the chain has more than one use, then we can't reorder the mem ops.
    if (!SDValue(Chain, 0)->hasOneUse())
      break;
    // TODO: Relax for unordered atomics (see D66309)
    if (!Chain->isSimple() || Chain->isIndexed())
      break;

    // Find the base pointer and offset for this memory node.
    const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
    // Check that the base pointer is the same as the original one.
    int64_t Offset;
    if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
      break;
    int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
    // Make sure we don't overlap with other intervals by checking the ones to
    // the left or right before inserting.
    auto I = Intervals.find(Offset);
    // If there's a next interval, we should end before it.
    if (I != Intervals.end() && I.start() < (Offset + Length))
      break;
    // If there's a previous interval, we should start after it.
    if (I != Intervals.begin() && (--I).stop() <= Offset)
      break;
    Intervals.insert(Offset, Offset + Length, Unit);

    ChainedStores.push_back(Chain);
    STChain = Chain;
  }

  // If we didn't find a chained store, exit.
  if (ChainedStores.size() == 0)
    return false;

  // Improve all chained stores (St and ChainedStores members) starting from
  // where the store chain ended and return single TokenFactor.
  SDValue NewChain = STChain->getChain();
  SmallVector<SDValue, 8> TFOps;
  for (unsigned I = ChainedStores.size(); I;) {
    StoreSDNode *S = ChainedStores[--I];
    SDValue BetterChain = FindBetterChain(S, NewChain);
    S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
        S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
    TFOps.push_back(SDValue(S, 0));
    ChainedStores[I] = S;
  }

  // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
  SDValue BetterChain = FindBetterChain(St, NewChain);
  SDValue NewST;
  if (St->isTruncatingStore())
    NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
                              St->getBasePtr(), St->getMemoryVT(),
                              St->getMemOperand());
  else
    NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
                         St->getBasePtr(), St->getMemOperand());

  TFOps.push_back(NewST);

  // If we improved every element of TFOps, then we've lost the dependence on
  // NewChain to successors of St and we need to add it back to TFOps. Do so at
  // the beginning to keep the relative order consistent with FindBetterChains.
  auto hasImprovedChain = [&](SDValue ST) -> bool {
    return ST->getOperand(0) != NewChain;
  };
  bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
  if (AddNewChain)
    TFOps.insert(TFOps.begin(), NewChain);

  SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
  CombineTo(St, TF);

  // Add TF and its operands to the worklist.
  AddToWorklist(TF.getNode());
  for (const SDValue &Op : TF->ops())
    AddToWorklist(Op.getNode());
  AddToWorklist(STChain);
  return true;
}

bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
  if (OptLevel == CodeGenOpt::None)
    return false;

  const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);

  // We must have a base and an offset.
  if (!BasePtr.getBase().getNode())
    return false;

  // Do not handle stores to undef base pointers.
  if (BasePtr.getBase().isUndef())
    return false;

  // Directly improve a chain of disjoint stores starting at St.
  if (parallelizeChainedStores(St))
    return true;

  // Improve St's chain.
  SDValue BetterChain = FindBetterChain(St, St->getChain());
  if (St->getChain() != BetterChain) {
    replaceStoreChain(St, BetterChain);
    return true;
  }

  return false;
}

/// This is the entry point for the file.
void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
                           CodeGenOpt::Level OptLevel) {
  /// This is the main entry point to this class.
  DAGCombiner(*this, AA, OptLevel).Run(Level);
}