//===- DAGCombiner.cpp - Implement a DAG node combiner --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
// both before and after the DAG is legalized.
//
// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
// primarily intended to handle simplification opportunities that are implicit
// in the LLVM IR and exposed by the various codegen lowering phases.
//
//===----------------------------------------------------------------------===//
18 #include "llvm/ADT/APFloat.h"
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/DenseMap.h"
22 #include "llvm/ADT/IntervalMap.h"
23 #include "llvm/ADT/None.h"
24 #include "llvm/ADT/Optional.h"
25 #include "llvm/ADT/STLExtras.h"
26 #include "llvm/ADT/SetVector.h"
27 #include "llvm/ADT/SmallBitVector.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/AliasAnalysis.h"
33 #include "llvm/Analysis/MemoryLocation.h"
34 #include "llvm/Analysis/TargetLibraryInfo.h"
35 #include "llvm/Analysis/VectorUtils.h"
36 #include "llvm/CodeGen/DAGCombine.h"
37 #include "llvm/CodeGen/ISDOpcodes.h"
38 #include "llvm/CodeGen/MachineFunction.h"
39 #include "llvm/CodeGen/MachineMemOperand.h"
40 #include "llvm/CodeGen/RuntimeLibcalls.h"
41 #include "llvm/CodeGen/SelectionDAG.h"
42 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
43 #include "llvm/CodeGen/SelectionDAGNodes.h"
44 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
45 #include "llvm/CodeGen/TargetLowering.h"
46 #include "llvm/CodeGen/TargetRegisterInfo.h"
47 #include "llvm/CodeGen/TargetSubtargetInfo.h"
48 #include "llvm/CodeGen/ValueTypes.h"
49 #include "llvm/IR/Attributes.h"
50 #include "llvm/IR/Constant.h"
51 #include "llvm/IR/DataLayout.h"
52 #include "llvm/IR/DerivedTypes.h"
53 #include "llvm/IR/Function.h"
54 #include "llvm/IR/Metadata.h"
55 #include "llvm/Support/Casting.h"
56 #include "llvm/Support/CodeGen.h"
57 #include "llvm/Support/CommandLine.h"
58 #include "llvm/Support/Compiler.h"
59 #include "llvm/Support/Debug.h"
60 #include "llvm/Support/ErrorHandling.h"
61 #include "llvm/Support/KnownBits.h"
62 #include "llvm/Support/MachineValueType.h"
63 #include "llvm/Support/MathExtras.h"
64 #include "llvm/Support/raw_ostream.h"
65 #include "llvm/Target/TargetMachine.h"
66 #include "llvm/Target/TargetOptions.h"
78 #define DEBUG_TYPE "dagcombine"
80 STATISTIC(NodesCombined , "Number of dag nodes combined");
81 STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
82 STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
83 STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
84 STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
85 STATISTIC(SlicedLoads, "Number of load sliced");
86 STATISTIC(NumFPLogicOpsConv, "Number of logic ops converted to fp ops");
89 CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
90 cl::desc("Enable DAG combiner's use of IR alias analysis"));
93 UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
94 cl::desc("Enable DAG combiner's use of TBAA"));
97 static cl::opt<std::string>
98 CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
99 cl::desc("Only use DAG-combiner alias analysis in this"
103 /// Hidden option to stress test load slicing, i.e., when this option
104 /// is enabled, load slicing bypasses most of its profitability guards.
106 StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
107 cl::desc("Bypass the profitability model of load slicing"),
111 MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
112 cl::desc("DAG combiner may split indexing from loads"));
115 EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
116 cl::desc("DAG combiner enable merging multiple stores "
117 "into a wider store"));
119 static cl::opt<unsigned> TokenFactorInlineLimit(
120 "combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
121 cl::desc("Limit the number of operands to inline for Token Factors"));
123 static cl::opt<unsigned> StoreMergeDependenceLimit(
124 "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
125 cl::desc("Limit the number of times for the same StoreNode and RootNode "
126 "to bail out in store merging dependence check"));
128 static cl::opt<bool> EnableReduceLoadOpStoreWidth(
129 "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
130 cl::desc("DAG combiner enable reducing the width of load/op/store "
133 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
134 "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
135 cl::desc("DAG combiner enable load/<replace bytes>/store with "
136 "a narrower store"));
142 const TargetLowering &TLI;
143 const SelectionDAGTargetInfo *STI;
144 CombineLevel Level = BeforeLegalizeTypes;
145 CodeGenOpt::Level OptLevel;
146 bool LegalDAG = false;
147 bool LegalOperations = false;
148 bool LegalTypes = false;
150 bool DisableGenericCombines;
152 /// Worklist of all of the nodes that need to be simplified.
154 /// This must behave as a stack -- new nodes to process are pushed onto the
155 /// back and when processing we pop off of the back.
157 /// The worklist will not contain duplicates but may contain null entries
158 /// due to nodes being deleted from the underlying DAG.
159 SmallVector<SDNode *, 64> Worklist;
161 /// Mapping from an SDNode to its position on the worklist.
163 /// This is used to find and remove nodes from the worklist (by nulling
164 /// them) when they are deleted from the underlying DAG. It relies on
165 /// stable indices of nodes within the worklist.
166 DenseMap<SDNode *, unsigned> WorklistMap;
167 /// This records all nodes attempted to add to the worklist since we
168 /// considered a new worklist entry. As we keep do not add duplicate nodes
169 /// in the worklist, this is different from the tail of the worklist.
170 SmallSetVector<SDNode *, 32> PruningList;
172 /// Set of nodes which have been combined (at least once).
174 /// This is used to allow us to reliably add any operands of a DAG node
175 /// which have not yet been combined to the worklist.
176 SmallPtrSet<SDNode *, 32> CombinedNodes;
178 /// Map from candidate StoreNode to the pair of RootNode and count.
179 /// The count is used to track how many times we have seen the StoreNode
180 /// with the same RootNode bail out in dependence check. If we have seen
181 /// the bail out for the same pair many times over a limit, we won't
182 /// consider the StoreNode with the same RootNode as store merging
184 DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;
186 // AA - Used for DAG load/store alias analysis.
189 /// When an instruction is simplified, add all users of the instruction to
190 /// the work lists because they might get more simplified now.
191 void AddUsersToWorklist(SDNode *N) {
192 for (SDNode *Node : N->uses())
196 /// Convenient shorthand to add a node and all of its user to the worklist.
197 void AddToWorklistWithUsers(SDNode *N) {
198 AddUsersToWorklist(N);
202 // Prune potentially dangling nodes. This is called after
203 // any visit to a node, but should also be called during a visit after any
204 // failed combine which may have created a DAG node.
205 void clearAddedDanglingWorklistEntries() {
206 // Check any nodes added to the worklist to see if they are prunable.
207 while (!PruningList.empty()) {
208 auto *N = PruningList.pop_back_val();
210 recursivelyDeleteUnusedNodes(N);
214 SDNode *getNextWorklistEntry() {
215 // Before we do any work, remove nodes that are not in use.
216 clearAddedDanglingWorklistEntries();
218 // The Worklist holds the SDNodes in order, but it may contain null
220 while (!N && !Worklist.empty()) {
221 N = Worklist.pop_back_val();
225 bool GoodWorklistEntry = WorklistMap.erase(N);
226 (void)GoodWorklistEntry;
227 assert(GoodWorklistEntry &&
228 "Found a worklist entry without a corresponding map entry!");
233 /// Call the node-specific routine that folds each particular type of node.
234 SDValue visit(SDNode *N);
237 DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
238 : DAG(D), TLI(D.getTargetLoweringInfo()),
239 STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL), AA(AA) {
240 ForCodeSize = DAG.shouldOptForSize();
241 DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
243 MaximumLegalStoreInBits = 0;
244 // We use the minimum store size here, since that's all we can guarantee
245 // for the scalable vector types.
246 for (MVT VT : MVT::all_valuetypes())
247 if (EVT(VT).isSimple() && VT != MVT::Other &&
248 TLI.isTypeLegal(EVT(VT)) &&
249 VT.getSizeInBits().getKnownMinSize() >= MaximumLegalStoreInBits)
250 MaximumLegalStoreInBits = VT.getSizeInBits().getKnownMinSize();
253 void ConsiderForPruning(SDNode *N) {
254 // Mark this for potential pruning.
255 PruningList.insert(N);
258 /// Add to the worklist making sure its instance is at the back (next to be
260 void AddToWorklist(SDNode *N) {
261 assert(N->getOpcode() != ISD::DELETED_NODE &&
262 "Deleted Node added to Worklist");
264 // Skip handle nodes as they can't usefully be combined and confuse the
265 // zero-use deletion strategy.
266 if (N->getOpcode() == ISD::HANDLENODE)
269 ConsiderForPruning(N);
271 if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
272 Worklist.push_back(N);
275 /// Remove all instances of N from the worklist.
276 void removeFromWorklist(SDNode *N) {
277 CombinedNodes.erase(N);
278 PruningList.remove(N);
279 StoreRootCountMap.erase(N);
281 auto It = WorklistMap.find(N);
282 if (It == WorklistMap.end())
283 return; // Not in the worklist.
285 // Null out the entry rather than erasing it to avoid a linear operation.
286 Worklist[It->second] = nullptr;
287 WorklistMap.erase(It);
290 void deleteAndRecombine(SDNode *N);
291 bool recursivelyDeleteUnusedNodes(SDNode *N);
293 /// Replaces all uses of the results of one DAG node with new values.
294 SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
297 /// Replaces all uses of the results of one DAG node with new values.
298 SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
299 return CombineTo(N, &Res, 1, AddTo);
302 /// Replaces all uses of the results of one DAG node with new values.
303 SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
305 SDValue To[] = { Res0, Res1 };
306 return CombineTo(N, To, 2, AddTo);
309 void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
312 unsigned MaximumLegalStoreInBits;
314 /// Check the specified integer node value to see if it can be simplified or
315 /// if things it uses can be simplified by bit propagation.
316 /// If so, return true.
317 bool SimplifyDemandedBits(SDValue Op) {
318 unsigned BitWidth = Op.getScalarValueSizeInBits();
319 APInt DemandedBits = APInt::getAllOnes(BitWidth);
320 return SimplifyDemandedBits(Op, DemandedBits);
323 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
324 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
326 if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
330 AddToWorklist(Op.getNode());
332 CommitTargetLoweringOpt(TLO);
336 /// Check the specified vector node value to see if it can be simplified or
337 /// if things it uses can be simplified as it only uses some of the
338 /// elements. If so, return true.
339 bool SimplifyDemandedVectorElts(SDValue Op) {
340 // TODO: For now just pretend it cannot be simplified.
341 if (Op.getValueType().isScalableVector())
344 unsigned NumElts = Op.getValueType().getVectorNumElements();
345 APInt DemandedElts = APInt::getAllOnes(NumElts);
346 return SimplifyDemandedVectorElts(Op, DemandedElts);
349 bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
350 const APInt &DemandedElts,
351 bool AssumeSingleUse = false);
352 bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
353 bool AssumeSingleUse = false);
355 bool CombineToPreIndexedLoadStore(SDNode *N);
356 bool CombineToPostIndexedLoadStore(SDNode *N);
357 SDValue SplitIndexingFromLoad(LoadSDNode *LD);
358 bool SliceUpLoad(SDNode *N);
360 // Scalars have size 0 to distinguish from singleton vectors.
361 SDValue ForwardStoreValueToDirectLoad(LoadSDNode *LD);
362 bool getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val);
363 bool extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val);
365 /// Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
368 /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
369 /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
370 /// \param EltNo index of the vector element to load.
371 /// \param OriginalLoad load that EVE came from to be replaced.
372 /// \returns EVE on success SDValue() on failure.
373 SDValue scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
375 LoadSDNode *OriginalLoad);
376 void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
377 SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
378 SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
379 SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
380 SDValue PromoteIntBinOp(SDValue Op);
381 SDValue PromoteIntShiftOp(SDValue Op);
382 SDValue PromoteExtend(SDValue Op);
383 bool PromoteLoad(SDValue Op);
385 /// Call the node-specific routine that knows how to fold each
386 /// particular type of node. If that doesn't do anything, try the
387 /// target-specific DAG combines.
388 SDValue combine(SDNode *N);
390 // Visitation implementation - Implement dag node combining for different
391 // node types. The semantics are as follows:
393 // SDValue.getNode() == 0 - No change was made
394 // SDValue.getNode() == N - N was replaced, is dead and has been handled.
395 // otherwise - N should be replaced by the returned Operand.
397 SDValue visitTokenFactor(SDNode *N);
398 SDValue visitMERGE_VALUES(SDNode *N);
399 SDValue visitADD(SDNode *N);
400 SDValue visitADDLike(SDNode *N);
401 SDValue visitADDLikeCommutative(SDValue N0, SDValue N1, SDNode *LocReference);
402 SDValue visitSUB(SDNode *N);
403 SDValue visitADDSAT(SDNode *N);
404 SDValue visitSUBSAT(SDNode *N);
405 SDValue visitADDC(SDNode *N);
406 SDValue visitADDO(SDNode *N);
407 SDValue visitUADDOLike(SDValue N0, SDValue N1, SDNode *N);
408 SDValue visitSUBC(SDNode *N);
409 SDValue visitSUBO(SDNode *N);
410 SDValue visitADDE(SDNode *N);
411 SDValue visitADDCARRY(SDNode *N);
412 SDValue visitSADDO_CARRY(SDNode *N);
413 SDValue visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn, SDNode *N);
414 SDValue visitSUBE(SDNode *N);
415 SDValue visitSUBCARRY(SDNode *N);
416 SDValue visitSSUBO_CARRY(SDNode *N);
417 SDValue visitMUL(SDNode *N);
418 SDValue visitMULFIX(SDNode *N);
419 SDValue useDivRem(SDNode *N);
420 SDValue visitSDIV(SDNode *N);
421 SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
422 SDValue visitUDIV(SDNode *N);
423 SDValue visitUDIVLike(SDValue N0, SDValue N1, SDNode *N);
424 SDValue visitREM(SDNode *N);
425 SDValue visitMULHU(SDNode *N);
426 SDValue visitMULHS(SDNode *N);
427 SDValue visitAVG(SDNode *N);
428 SDValue visitSMUL_LOHI(SDNode *N);
429 SDValue visitUMUL_LOHI(SDNode *N);
430 SDValue visitMULO(SDNode *N);
431 SDValue visitIMINMAX(SDNode *N);
432 SDValue visitAND(SDNode *N);
433 SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *N);
434 SDValue visitOR(SDNode *N);
435 SDValue visitORLike(SDValue N0, SDValue N1, SDNode *N);
436 SDValue visitXOR(SDNode *N);
437 SDValue SimplifyVBinOp(SDNode *N, const SDLoc &DL);
438 SDValue visitSHL(SDNode *N);
439 SDValue visitSRA(SDNode *N);
440 SDValue visitSRL(SDNode *N);
441 SDValue visitFunnelShift(SDNode *N);
442 SDValue visitSHLSAT(SDNode *N);
443 SDValue visitRotate(SDNode *N);
444 SDValue visitABS(SDNode *N);
445 SDValue visitBSWAP(SDNode *N);
446 SDValue visitBITREVERSE(SDNode *N);
447 SDValue visitCTLZ(SDNode *N);
448 SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
449 SDValue visitCTTZ(SDNode *N);
450 SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
451 SDValue visitCTPOP(SDNode *N);
452 SDValue visitSELECT(SDNode *N);
453 SDValue visitVSELECT(SDNode *N);
454 SDValue visitSELECT_CC(SDNode *N);
455 SDValue visitSETCC(SDNode *N);
456 SDValue visitSETCCCARRY(SDNode *N);
457 SDValue visitSIGN_EXTEND(SDNode *N);
458 SDValue visitZERO_EXTEND(SDNode *N);
459 SDValue visitANY_EXTEND(SDNode *N);
460 SDValue visitAssertExt(SDNode *N);
461 SDValue visitAssertAlign(SDNode *N);
462 SDValue visitSIGN_EXTEND_INREG(SDNode *N);
463 SDValue visitEXTEND_VECTOR_INREG(SDNode *N);
464 SDValue visitTRUNCATE(SDNode *N);
465 SDValue visitBITCAST(SDNode *N);
466 SDValue visitFREEZE(SDNode *N);
467 SDValue visitBUILD_PAIR(SDNode *N);
468 SDValue visitFADD(SDNode *N);
469 SDValue visitSTRICT_FADD(SDNode *N);
470 SDValue visitFSUB(SDNode *N);
471 SDValue visitFMUL(SDNode *N);
472 SDValue visitFMA(SDNode *N);
473 SDValue visitFDIV(SDNode *N);
474 SDValue visitFREM(SDNode *N);
475 SDValue visitFSQRT(SDNode *N);
476 SDValue visitFCOPYSIGN(SDNode *N);
477 SDValue visitFPOW(SDNode *N);
478 SDValue visitSINT_TO_FP(SDNode *N);
479 SDValue visitUINT_TO_FP(SDNode *N);
480 SDValue visitFP_TO_SINT(SDNode *N);
481 SDValue visitFP_TO_UINT(SDNode *N);
482 SDValue visitFP_ROUND(SDNode *N);
483 SDValue visitFP_EXTEND(SDNode *N);
484 SDValue visitFNEG(SDNode *N);
485 SDValue visitFABS(SDNode *N);
486 SDValue visitFCEIL(SDNode *N);
487 SDValue visitFTRUNC(SDNode *N);
488 SDValue visitFFLOOR(SDNode *N);
489 SDValue visitFMinMax(SDNode *N);
490 SDValue visitBRCOND(SDNode *N);
491 SDValue visitBR_CC(SDNode *N);
492 SDValue visitLOAD(SDNode *N);
494 SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
495 SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
497 SDValue visitSTORE(SDNode *N);
498 SDValue visitLIFETIME_END(SDNode *N);
499 SDValue visitINSERT_VECTOR_ELT(SDNode *N);
500 SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
501 SDValue visitBUILD_VECTOR(SDNode *N);
502 SDValue visitCONCAT_VECTORS(SDNode *N);
503 SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
504 SDValue visitVECTOR_SHUFFLE(SDNode *N);
505 SDValue visitSCALAR_TO_VECTOR(SDNode *N);
506 SDValue visitINSERT_SUBVECTOR(SDNode *N);
507 SDValue visitMLOAD(SDNode *N);
508 SDValue visitMSTORE(SDNode *N);
509 SDValue visitMGATHER(SDNode *N);
510 SDValue visitMSCATTER(SDNode *N);
511 SDValue visitFP_TO_FP16(SDNode *N);
512 SDValue visitFP16_TO_FP(SDNode *N);
513 SDValue visitFP_TO_BF16(SDNode *N);
514 SDValue visitVECREDUCE(SDNode *N);
515 SDValue visitVPOp(SDNode *N);
517 SDValue visitFADDForFMACombine(SDNode *N);
518 SDValue visitFSUBForFMACombine(SDNode *N);
519 SDValue visitFMULForFMADistributiveCombine(SDNode *N);
521 SDValue XformToShuffleWithZero(SDNode *N);
522 bool reassociationCanBreakAddressingModePattern(unsigned Opc,
527 SDValue reassociateOpsCommutative(unsigned Opc, const SDLoc &DL, SDValue N0,
529 SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
530 SDValue N1, SDNodeFlags Flags);
532 SDValue visitShiftByConstant(SDNode *N);
534 SDValue foldSelectOfConstants(SDNode *N);
535 SDValue foldVSelectOfConstants(SDNode *N);
536 SDValue foldBinOpIntoSelect(SDNode *BO);
537 bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
538 SDValue hoistLogicOpWithSameOpcodeHands(SDNode *N);
539 SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
540 SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
541 SDValue N2, SDValue N3, ISD::CondCode CC,
542 bool NotExtCompare = false);
543 SDValue convertSelectOfFPConstantsToLoadOffset(
544 const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
546 SDValue foldSignChangeInBitcast(SDNode *N);
547 SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
548 SDValue N2, SDValue N3, ISD::CondCode CC);
549 SDValue foldSelectOfBinops(SDNode *N);
550 SDValue foldSextSetcc(SDNode *N);
551 SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
553 SDValue foldSubToUSubSat(EVT DstVT, SDNode *N);
554 SDValue unfoldMaskedMerge(SDNode *N);
555 SDValue unfoldExtremeBitClearingToShifts(SDNode *N);
556 SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
557 const SDLoc &DL, bool foldBooleans);
558 SDValue rebuildSetCC(SDValue N);
560 bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
561 SDValue &CC, bool MatchStrict = false) const;
562 bool isOneUseSetCC(SDValue N) const;
564 SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
566 SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
567 SDValue CombineExtLoad(SDNode *N);
568 SDValue CombineZExtLogicopShiftLoad(SDNode *N);
569 SDValue combineRepeatedFPDivisors(SDNode *N);
570 SDValue combineInsertEltToShuffle(SDNode *N, unsigned InsIndex);
571 SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
572 SDValue BuildSDIV(SDNode *N);
573 SDValue BuildSDIVPow2(SDNode *N);
574 SDValue BuildUDIV(SDNode *N);
575 SDValue BuildSREMPow2(SDNode *N);
576 SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N);
577 SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
578 SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
579 SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
580 SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
581 SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
582 SDValue buildSqrtNROneConst(SDValue Arg, SDValue Est, unsigned Iterations,
583 SDNodeFlags Flags, bool Reciprocal);
584 SDValue buildSqrtNRTwoConst(SDValue Arg, SDValue Est, unsigned Iterations,
585 SDNodeFlags Flags, bool Reciprocal);
586 SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
587 bool DemandHighBits = true);
588 SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
589 SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
590 SDValue InnerPos, SDValue InnerNeg, bool HasPos,
591 unsigned PosOpcode, unsigned NegOpcode,
593 SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
594 SDValue InnerPos, SDValue InnerNeg, bool HasPos,
595 unsigned PosOpcode, unsigned NegOpcode,
597 SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
598 SDValue MatchLoadCombine(SDNode *N);
599 SDValue mergeTruncStores(StoreSDNode *N);
600 SDValue reduceLoadWidth(SDNode *N);
601 SDValue ReduceLoadOpStoreWidth(SDNode *N);
602 SDValue splitMergedValStore(StoreSDNode *ST);
603 SDValue TransformFPLoadStorePair(SDNode *N);
604 SDValue convertBuildVecZextToZext(SDNode *N);
605 SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
606 SDValue reduceBuildVecTruncToBitCast(SDNode *N);
607 SDValue reduceBuildVecToShuffle(SDNode *N);
608 SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
609 ArrayRef<int> VectorMask, SDValue VecIn1,
610 SDValue VecIn2, unsigned LeftIdx,
612 SDValue matchVSelectOpSizesWithSetCC(SDNode *Cast);
614 /// Walk up chain skipping non-aliasing memory nodes,
615 /// looking for aliasing nodes and adding them to the Aliases vector.
616 void GatherAllAliases(SDNode *N, SDValue OriginalChain,
617 SmallVectorImpl<SDValue> &Aliases);
619 /// Return true if there is any possibility that the two addresses overlap.
620 bool mayAlias(SDNode *Op0, SDNode *Op1) const;
622 /// Walk up chain skipping non-aliasing memory nodes, looking for a better
623 /// chain (aliasing node.)
624 SDValue FindBetterChain(SDNode *N, SDValue Chain);
626 /// Try to replace a store and any possibly adjacent stores on
627 /// consecutive chains with better chains. Return true only if St is
630 /// Notice that other chains may still be replaced even if the function
632 bool findBetterNeighborChains(StoreSDNode *St);
634 // Helper for findBetterNeighborChains. Walk up store chain add additional
635 // chained stores that do not overlap and can be parallelized.
636 bool parallelizeChainedStores(StoreSDNode *St);
638 /// Holds a pointer to an LSBaseSDNode as well as information on where it
639 /// is located in a sequence of memory operations connected by a chain.
641 // Ptr to the mem node.
642 LSBaseSDNode *MemNode;
644 // Offset from the base ptr.
645 int64_t OffsetFromBase;
647 MemOpLink(LSBaseSDNode *N, int64_t Offset)
648 : MemNode(N), OffsetFromBase(Offset) {}
651 // Classify the origin of a stored value.
652 enum class StoreSource { Unknown, Constant, Extract, Load };
653 StoreSource getStoreSource(SDValue StoreVal) {
654 switch (StoreVal.getOpcode()) {
656 case ISD::ConstantFP:
657 return StoreSource::Constant;
658 case ISD::EXTRACT_VECTOR_ELT:
659 case ISD::EXTRACT_SUBVECTOR:
660 return StoreSource::Extract;
662 return StoreSource::Load;
664 return StoreSource::Unknown;
668 /// This is a helper function for visitMUL to check the profitability
669 /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
670 /// MulNode is the original multiply, AddNode is (add x, c1),
671 /// and ConstNode is c2.
672 bool isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
675 /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
676 /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
677 /// the type of the loaded value to be extended.
678 bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
679 EVT LoadResultTy, EVT &ExtVT);
681 /// Helper function to calculate whether the given Load/Store can have its
682 /// width reduced to ExtVT.
683 bool isLegalNarrowLdSt(LSBaseSDNode *LDSTN, ISD::LoadExtType ExtType,
684 EVT &MemVT, unsigned ShAmt = 0);
686 /// Used by BackwardsPropagateMask to find suitable loads.
687 bool SearchForAndLoads(SDNode *N, SmallVectorImpl<LoadSDNode*> &Loads,
688 SmallPtrSetImpl<SDNode*> &NodesWithConsts,
689 ConstantSDNode *Mask, SDNode *&NodeToMask);
690 /// Attempt to propagate a given AND node back to load leaves so that they
691 /// can be combined into narrow loads.
692 bool BackwardsPropagateMask(SDNode *N);
694 /// Helper function for mergeConsecutiveStores which merges the component
696 SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
699 /// This is a helper function for mergeConsecutiveStores. When the source
700 /// elements of the consecutive stores are all constants or all extracted
701 /// vector elements, try to merge them into one larger store introducing
702 /// bitcasts if necessary. \return True if a merged store was created.
703 bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
704 EVT MemVT, unsigned NumStores,
705 bool IsConstantSrc, bool UseVector,
708 /// This is a helper function for mergeConsecutiveStores. Stores that
709 /// potentially may be merged with St are placed in StoreNodes. RootNode is
710 /// a chain predecessor to all store candidates.
711 void getStoreMergeCandidates(StoreSDNode *St,
712 SmallVectorImpl<MemOpLink> &StoreNodes,
715 /// Helper function for mergeConsecutiveStores. Checks if candidate stores
716 /// have indirect dependency through their operands. RootNode is the
717 /// predecessor to all stores calculated by getStoreMergeCandidates and is
718 /// used to prune the dependency check. \return True if safe to merge.
719 bool checkMergeStoreCandidatesForDependencies(
720 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
723 /// This is a helper function for mergeConsecutiveStores. Given a list of
724 /// store candidates, find the first N that are consecutive in memory.
725 /// Returns 0 if there are not at least 2 consecutive stores to try merging.
726 unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
727 int64_t ElementSizeBytes) const;
729 /// This is a helper function for mergeConsecutiveStores. It is used for
730 /// store chains that are composed entirely of constant values.
731 bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
732 unsigned NumConsecutiveStores,
733 EVT MemVT, SDNode *Root, bool AllowVectors);
735 /// This is a helper function for mergeConsecutiveStores. It is used for
736 /// store chains that are composed entirely of extracted vector elements.
737 /// When extracting multiple vector elements, try to store them in one
738 /// vector store rather than a sequence of scalar stores.
739 bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
740 unsigned NumConsecutiveStores, EVT MemVT,
743 /// This is a helper function for mergeConsecutiveStores. It is used for
744 /// store chains that are composed entirely of loaded values.
745 bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
746 unsigned NumConsecutiveStores, EVT MemVT,
747 SDNode *Root, bool AllowVectors,
748 bool IsNonTemporalStore, bool IsNonTemporalLoad);
750 /// Merge consecutive store operations into a wide store.
751 /// This optimization uses wide integers or vectors when possible.
752 /// \return true if stores were merged.
753 bool mergeConsecutiveStores(StoreSDNode *St);
755 /// Try to transform a truncation where C is a constant:
756 /// (trunc (and X, C)) -> (and (trunc X), (trunc C))
758 /// \p N needs to be a truncation and its first operand an AND. Other
759 /// requirements are checked by the function (e.g. that trunc is
760 /// single-use) and if missed an empty SDValue is returned.
761 SDValue distributeTruncateThroughAnd(SDNode *N);
763 /// Helper function to determine whether the target supports operation
764 /// given by \p Opcode for type \p VT, that is, whether the operation
765 /// is legal or custom before legalizing operations, and whether is
766 /// legal (but not custom) after legalization.
767 bool hasOperation(unsigned Opcode, EVT VT) {
768 return TLI.isOperationLegalOrCustom(Opcode, VT, LegalOperations);
772 /// Runs the dag combiner on all nodes in the work list
773 void Run(CombineLevel AtLevel);
775 SelectionDAG &getDAG() const { return DAG; }
777 /// Returns a type large enough to hold any valid shift amount - before type
778 /// legalization these can be huge.
779 EVT getShiftAmountTy(EVT LHSTy) {
780 assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
781 return TLI.getShiftAmountTy(LHSTy, DAG.getDataLayout(), LegalTypes);
784 /// This method returns true if we are running before type legalization or
785 /// if the specified VT is legal.
786 bool isTypeLegal(const EVT &VT) {
787 if (!LegalTypes) return true;
788 return TLI.isTypeLegal(VT);
791 /// Convenience wrapper around TargetLowering::getSetCCResultType
792 EVT getSetCCResultType(EVT VT) const {
793 return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
796 void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
797 SDValue OrigLoad, SDValue ExtLoad,
798 ISD::NodeType ExtType);
801 /// This class is a DAGUpdateListener that removes any deleted
802 /// nodes from the worklist.
803 class WorklistRemover : public SelectionDAG::DAGUpdateListener {
807 explicit WorklistRemover(DAGCombiner &dc)
808 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
810 void NodeDeleted(SDNode *N, SDNode *E) override {
811 DC.removeFromWorklist(N);
815 class WorklistInserter : public SelectionDAG::DAGUpdateListener {
819 explicit WorklistInserter(DAGCombiner &dc)
820 : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
822 // FIXME: Ideally we could add N to the worklist, but this causes exponential
823 // compile time costs in large DAGs, e.g. Halide.
824 void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
827 } // end anonymous namespace
//===----------------------------------------------------------------------===//
// TargetLowering::DAGCombinerInfo implementation
//===----------------------------------------------------------------------===//
833 void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
834 ((DAGCombiner*)DC)->AddToWorklist(N);
837 SDValue TargetLowering::DAGCombinerInfo::
838 CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
839 return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
842 SDValue TargetLowering::DAGCombinerInfo::
843 CombineTo(SDNode *N, SDValue Res, bool AddTo) {
844 return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
847 SDValue TargetLowering::DAGCombinerInfo::
848 CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
849 return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
852 bool TargetLowering::DAGCombinerInfo::
853 recursivelyDeleteUnusedNodes(SDNode *N) {
854 return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
857 void TargetLowering::DAGCombinerInfo::
858 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
859 return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
862 //===----------------------------------------------------------------------===//
864 //===----------------------------------------------------------------------===//
// Remove N from the worklist and queue its operands for revisiting, since
// deleting N may make them dead or newly simplifiable.
866 void DAGCombiner::deleteAndRecombine(SDNode *N) {
867 removeFromWorklist(N);
869 // If the operands of this node are only used by the node, they will now be
870 // dead. Make sure to re-visit them and recursively delete dead nodes.
871 for (const SDValue &Op : N->ops())
872 // For an operand generating multiple values, one of the values may
873 // become dead allowing further simplification (e.g. split index
874 // arithmetic from an indexed load).
875 if (Op->hasOneUse() || Op->getNumValues() > 1)
876 AddToWorklist(Op.getNode())
881 // APInts must be the same size for most operations, this helper
882 // function zero extends the shorter of the pair so that they match.
883 // We provide an Offset so that we can create bitwidths that won't overflow.
884 static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
// Target width: the wider of the two inputs, plus Offset spare bits so a
// caller can e.g. add/shift afterwards without wrapping.
885 unsigned Bits = Offset + std::max(LHS.getBitWidth(), RHS.getBitWidth());
886 LHS = LHS.zext(Bits);
887 RHS = RHS.zext(Bits);
890 // Return true if this node is a setcc, or is a select_cc
891 // that selects between the target values used for true and false, making it
892 // equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
893 // the appropriate nodes based on the type of node we are checking. This
894 // simplifies life a bit for the callers.
895 bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
896 SDValue &CC, bool MatchStrict) const {
897 if (N.getOpcode() == ISD::SETCC) {
898 LHS = N.getOperand(0);
899 RHS = N.getOperand(1);
900 CC = N.getOperand(2);
// Strict FP setcc variants carry a chain as operand 0, so the comparison
// operands are shifted by one relative to plain SETCC.
905 (N.getOpcode() == ISD::STRICT_FSETCC ||
906 N.getOpcode() == ISD::STRICT_FSETCCS)) {
907 LHS = N.getOperand(1);
908 RHS = N.getOperand(2);
909 CC = N.getOperand(3);
// A select_cc only acts like a setcc when its selected values are exactly
// the target's canonical true/false constants.
913 if (N.getOpcode() != ISD::SELECT_CC || !TLI.isConstTrueVal(N.getOperand(2)) ||
914 !TLI.isConstFalseVal(N.getOperand(3)))
// With undefined boolean contents we cannot rely on the true/false values,
// so decline the match.
917 if (TLI.getBooleanContents(N.getValueType()) ==
918 TargetLowering::UndefinedBooleanContent)
921 LHS = N.getOperand(0);
922 RHS = N.getOperand(1);
923 CC = N.getOperand(4);
927 /// Return true if this is a SetCC-equivalent operation with only one use.
928 /// If this is true, it allows the users to invert the operation for free when
929 /// it is profitable to do so.
930 bool DAGCombiner::isOneUseSetCC(SDValue N) const {
// The extracted operands are discarded; we only care whether N matches.
932 if (isSetCCEquivalent(N, N0, N1, N2) && N->hasOneUse())
// Returns true if N is a constant splat whose value is exactly the
// all-ones mask for the given scalar type (e.g. 0xFFFF for i16).
937 static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
938 if (!ScalarTy.isSimple())
941 uint64_t MaskForTy = 0ULL;
942 switch (ScalarTy.getSimpleVT().SimpleTy) {
947 MaskForTy = 0xFFFFULL;
950 MaskForTy = 0xFFFFFFFFULL;
958 if (ISD::isConstantSplatVector(N, Val))
959 return Val.getLimitedValue() == MaskForTy;
964 // Determines if it is a constant integer or a splat/build vector of constant
965 // integers (and undefs).
966 // Do not permit build vector implicit truncation.
967 static bool isConstantOrConstantVector(SDValue N, bool NoOpaques = false) {
968 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N))
969 return !(Const->isOpaque() && NoOpaques)
970 if (N.getOpcode() != ISD::BUILD_VECTOR && N.getOpcode() != ISD::SPLAT_VECTOR)
// Every constant element must match the vector's scalar width exactly
// (no implicit truncation) and respect the opaqueness restriction.
972 unsigned BitWidth = N.getScalarValueSizeInBits();
973 for (const SDValue &Op : N->op_values()) {
976 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(Op);
977 if (!Const || Const->getAPIntValue().getBitWidth() != BitWidth ||
978 (Const->isOpaque() && NoOpaques))
984 // Determines if a BUILD_VECTOR is composed of all-constants possibly mixed with
// undefs; accepts either all-integer or all-FP constant elements.
986 static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
987 if (V.getOpcode() != ISD::BUILD_VECTOR)
989 return isConstantOrConstantVector(V, NoOpaques) ||
990 ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
993 // Determine if this an indexed load with an opaque target constant index.
// Splitting is allowed (behind the MaySplitLoadIndex option) unless the
// index operand is an opaque TargetConstant, which must not be touched.
994 static bool canSplitIdx(LoadSDNode *LD) {
995 return MaySplitLoadIndex &&
996 (LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
997 !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
// Returns true when reassociating (add (add x, c1), c2) would destroy an
// addressing-mode pattern that a memory user of N depends on.
1000 bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
1005 // Currently this only tries to ensure we don't undo the GEP splits done by
1006 // CodeGenPrepare when shouldConsiderGEPOffsetSplit is true. To ensure this,
1007 // we check if the following transformation would be problematic:
1008 // (load/store (add, (add, x, offset1), offset2)) ->
1009 // (load/store (add, x, offset1+offset2)).
1011 // (load/store (add, (add, x, y), offset2)) ->
1012 // (load/store (add, (add, x, offset2), y)).
1014 if (Opc != ISD::ADD || N0.getOpcode() != ISD::ADD)
1017 auto *C2 = dyn_cast<ConstantSDNode>(N1)
// Give up on offsets that do not fit in 64 bits; addressing-mode queries
// below take an int64_t BaseOffs.
1021 const APInt &C2APIntVal = C2->getAPIntValue();
1022 if (C2APIntVal.getSignificantBits() > 64)
1025 if (auto *C1 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
1029 const APInt &C1APIntVal = C1->getAPIntValue();
1030 const APInt CombinedValueIntVal = C1APIntVal + C2APIntVal;
1031 if (CombinedValueIntVal.getSignificantBits() > 64)
1033 const int64_t CombinedValue = CombinedValueIntVal.getSExtValue();
1035 for (SDNode *Node : N->uses()) {
1036 if (auto *LoadStore = dyn_cast<MemSDNode>(Node)) {
1037 // Is x[offset2] already not a legal addressing mode? If so then
1038 // reassociating the constants breaks nothing (we test offset2 because
1039 // that's the one we hope to fold into the load or store).
1040 TargetLoweringBase::AddrMode AM;
1041 AM.HasBaseReg = true;
1042 AM.BaseOffs = C2APIntVal.getSExtValue();
1043 EVT VT = LoadStore->getMemoryVT();
1044 unsigned AS = LoadStore->getAddressSpace();
1045 Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
1046 if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
1049 // Would x[offset1+offset2] still be a legal addressing mode?
1050 AM.BaseOffs = CombinedValue;
1051 if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
// Non-constant inner operand: a foldable global address plus a legal
// offset addressing mode means the reassociation would break the pattern.
1056 if (auto *GA = dyn_cast<GlobalAddressSDNode>(N0.getOperand(1)))
1057 if (GA->getOpcode() == ISD::GlobalAddress && TLI.isOffsetFoldingLegal(GA))
1060 for (SDNode *Node : N->uses()) {
1061 auto *LoadStore = dyn_cast<MemSDNode>(Node)
1065 // Is x[offset2] a legal addressing mode? If so then
1066 // reassociating the constants breaks address pattern
1067 TargetLoweringBase::AddrMode AM;
1068 AM.HasBaseReg = true;
1069 AM.BaseOffs = C2APIntVal.getSExtValue();
1070 EVT VT = LoadStore->getMemoryVT();
1071 unsigned AS = LoadStore->getAddressSpace();
1072 Type *AccessTy = VT.getTypeForEVT(*DAG.getContext());
1073 if (!TLI.isLegalAddressingMode(DAG.getDataLayout(), AM, AccessTy, AS))
1082 // Helper for DAGCombiner::reassociateOps. Try to reassociate an expression
1083 // such as (Opc N0, N1), if \p N0 is the same kind of operation as \p Opc.
1084 SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
1085 SDValue N0, SDValue N1) {
1086 EVT VT = N0.getValueType();
1088 if (N0.getOpcode() != Opc)
1091 SDValue N00 = N0.getOperand(0);
1092 SDValue N01 = N0.getOperand(1);
1094 if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N01))) {
1095 if (DAG.isConstantIntBuildVectorOrConstantInt(peekThroughBitcasts(N1))) {
1096 // Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
1097 if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, {N01, N1}))
1098 return DAG.getNode(Opc, DL, VT, N00, OpNode)
1101 if (TLI.isReassocProfitable(DAG, N0, N1)) {
1102 // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1)
1103 // iff (op x, c1) has one use
1104 SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1);
1105 return DAG.getNode(Opc, DL, VT, OpNode, N01)
1109 // Check for repeated operand logic simplifications.
1110 if (Opc == ISD::AND || Opc == ISD::OR) {
1111 // (N00 & N01) & N00 --> N00 & N01
1112 // (N00 & N01) & N01 --> N00 & N01
1113 // (N00 | N01) | N00 --> N00 | N01
1114 // (N00 | N01) | N01 --> N00 | N01
1115 if (N1 == N00 || N1 == N01)
1118 if (Opc == ISD::XOR) {
1119 // (N00 ^ N01) ^ N00 --> N01
1122 // (N00 ^ N01) ^ N01 --> N00
1127 if (TLI.isReassocProfitable(DAG, N0, N1)) {
1129 // Reassociate if (op N00, N1) already exist
1130 if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N00, N1})) {
1131 // if Op (Op N00, N1), N01 already exist
1132 // we need to stop reassociating to avoid an infinite loop
1133 if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N01}))
1134 return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N01)
1139 // Reassociate if (op N01, N1) already exist
1140 if (SDNode *NE = DAG.getNodeIfExists(Opc, DAG.getVTList(VT), {N01, N1})) {
1141 // if Op (Op N01, N1), N00 already exist
1142 // we need to stop reassociating to avoid an infinite loop
1143 if (!DAG.doesNodeExist(Opc, DAG.getVTList(VT), {SDValue(NE, 0), N00}))
1144 return DAG.getNode(Opc, DL, VT, SDValue(NE, 0), N00)
1152 // Try to reassociate commutative binops.
// Tries both orderings (N0, N1) and (N1, N0) via reassociateOpsCommutative.
1153 SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
1154 SDValue N1, SDNodeFlags Flags) {
1155 assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
1157 // Floating-point reassociation is not allowed without loose FP math.
1158 if (N0.getValueType().isFloatingPoint() ||
1159 N1.getValueType().isFloatingPoint())
1160 if (!Flags.hasAllowReassociation() || !Flags.hasNoSignedZeros())
1163 if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N0, N1))
1165 if (SDValue Combined = reassociateOpsCommutative(Opc, DL, N1, N0))
// Replace all NumTo results of N with the values in To, update the
// worklist, delete N if it became dead, and return SDValue(N, 0) so the
// caller can signal that a combine happened.
1170 SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
1172 assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
1174 LLVM_DEBUG(dbgs() << "\nReplacing.1 "; N->dump(&DAG); dbgs() << "\nWith: ";
1176 dbgs() << " and " << NumTo - 1 << " other values\n");
1177 for (unsigned i = 0, e = NumTo; i != e; ++i)
1178 assert((!To[i].getNode() ||
1179 N->getValueType(i) == To[i].getValueType()) &&
1180 "Cannot combine value to value of different type!");
1182 WorklistRemover DeadNodes(*this);
1183 DAG.ReplaceAllUsesWith(N, To);
1185 // Push the new nodes and any users onto the worklist
1186 for (unsigned i = 0, e = NumTo; i != e; ++i) {
1187 if (To[i].getNode())
1188 AddToWorklistWithUsers(To[i].getNode())
1192 // Finally, if the node is now dead, remove it from the graph. The node
1193 // may not be dead if the replacement process recursively simplified to
1194 // something else needing this node.
1196 deleteAndRecombine(N);
1197 return SDValue(N, 0);
// Apply a replacement computed by the TargetLowering bit/elt simplifiers:
// RAUW TLO.Old with TLO.New, then clean up the worklist and dead nodes.
1201 CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
1202 // Replace the old value with the new one.
1204 LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.dump(&DAG);
1205 dbgs() << "\nWith: "; TLO.New.dump(&DAG); dbgs() << '\n');
1207 // Replace all uses.
1208 DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
1210 // Push the new node and any (possibly new) users onto the worklist.
1211 AddToWorklistWithUsers(TLO.New.getNode());
1213 // Finally, if the node is now dead, remove it from the graph.
1214 recursivelyDeleteUnusedNodes(TLO.Old.getNode());
1217 /// Check the specified integer node value to see if it can be simplified or if
1218 /// things it uses can be simplified by bit propagation. If so, return true.
1219 bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
1220 const APInt &DemandedElts,
1221 bool AssumeSingleUse) {
1222 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
// Delegate to TargetLowering; depth starts at 0.
1224 if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
1228 // Revisit the node.
1229 AddToWorklist(Op.getNode());
// Commit the Old->New replacement recorded in TLO.
1231 CommitTargetLoweringOpt(TLO);
1235 /// Check the specified vector node value to see if it can be simplified or
1236 /// if things it uses can be simplified as it only uses some of the elements.
1237 /// If so, return true.
1238 bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
1239 const APInt &DemandedElts,
1240 bool AssumeSingleUse) {
1241 TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
1242 APInt KnownUndef, KnownZero;
// Delegate to TargetLowering; depth starts at 0.
1243 if (!TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero,
1244 TLO, 0, AssumeSingleUse))
1247 // Revisit the node.
1248 AddToWorklist(Op.getNode());
// Commit the Old->New replacement recorded in TLO.
1250 CommitTargetLoweringOpt(TLO);
// Replace Load's value with a truncation of the wider ExtLoad and thread
// ExtLoad's chain through to Load's chain users, then delete Load if dead.
1254 void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
1256 EVT VT = Load->getValueType(0);
1257 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, VT, SDValue(ExtLoad, 0));
1259 LLVM_DEBUG(dbgs() << "\nReplacing.9 "; Load->dump(&DAG); dbgs() << "\nWith: ";
1260 Trunc.dump(&DAG); dbgs() << '\n');
// Value result (0) gets the truncation; chain result (1) gets ExtLoad's chain.
1262 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
1263 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
1265 AddToWorklist(Trunc.getNode());
1266 recursivelyDeleteUnusedNodes(Load);
// Produce a PVT-typed version of Op for integer-promotion combines.
// Sets \p Replace when the caller must later replace the original node
// (e.g. a load that was widened in place).
1269 SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
1272 if (ISD::isUNINDEXEDLoad(Op.getNode())) {
1273 LoadSDNode *LD = cast<LoadSDNode>(Op);
1274 EVT MemVT = LD->getMemoryVT();
// A plain (non-extending) load is re-emitted as an EXTLOAD to the wider
// type; extending loads keep their existing extension kind.
1275 ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
1276 : LD->getExtensionType();
1278 return DAG.getExtLoad(ExtType, DL, PVT,
1279 LD->getChain(), LD->getBasePtr(),
1280 MemVT, LD->getMemOperand())
1283 unsigned Opc = Op.getOpcode();
1286 case ISD::AssertSext:
1287 if (SDValue Op0 = SExtPromoteOperand(Op.getOperand(0), PVT))
1288 return DAG.getNode(ISD::AssertSext, DL, PVT, Op0, Op.getOperand(1))
1290 case ISD::AssertZext:
1291 if (SDValue Op0 = ZExtPromoteOperand(Op.getOperand(0), PVT))
1292 return DAG.getNode(ISD::AssertZext, DL, PVT, Op0, Op.getOperand(1))
1294 case ISD::Constant: {
1296 Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
1297 return DAG.getNode(ExtOpc, DL, PVT, Op)
// Default: any-extend, but only if the target supports it at this type.
1301 if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
1303 return DAG.getNode(ISD::ANY_EXTEND, DL, PVT, Op);
// Promote Op to PVT and make its value sign-correct by wrapping it in a
// SIGN_EXTEND_INREG of the original (narrower) type.
1306 SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
1307 if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
1309 EVT OldVT = Op.getValueType();
1311 bool Replace = false;
1312 SDValue NewOp = PromoteOperand(Op, PVT, Replace);
1313 if (!NewOp.getNode())
1315 AddToWorklist(NewOp.getNode());
1318 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
1319 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, NewOp.getValueType(), NewOp,
1320 DAG.getValueType(OldVT));
// Promote Op to PVT and clear the newly introduced high bits by
// zero-extending "in register" from the original (narrower) type.
1323 SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
1324 EVT OldVT = Op.getValueType();
1326 bool Replace = false;
1327 SDValue NewOp = PromoteOperand(Op, PVT, Replace);
1328 if (!NewOp.getNode())
1330 AddToWorklist(NewOp.getNode());
1333 ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
1334 return DAG.getZeroExtendInReg(NewOp, DL, OldVT);
1337 /// Promote the specified integer binary operation if the target indicates it is
1338 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to
1339 /// i32 since i16 instructions are longer.
1340 SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
1341 if (!LegalOperations)
// Only scalar integer operations are candidates for promotion.
1344 EVT VT = Op.getValueType();
1345 if (VT.isVector() || !VT.isInteger())
1348 // If operation type is 'undesirable', e.g. i16 on x86, consider
1350 unsigned Opc = Op.getOpcode();
1351 if (TLI.isTypeDesirableForOp(Opc, VT))
1355 // Consult target whether it is a good idea to promote this operation and
1356 // what's the right type to promote it to.
1357 if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
1358 assert(PVT != VT && "Don't know what type to promote to!");
1360 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
1362 bool Replace0 = false;
1363 SDValue N0 = Op.getOperand(0);
1364 SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
1366 bool Replace1 = false;
1367 SDValue N1 = Op.getOperand(1);
1368 SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
// Perform the operation at the wider type, then truncate back to VT.
1372 DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
1374 // We are always replacing N0/N1's use in N and only need additional
1375 // replacements if there are additional uses.
1376 // Note: We are checking uses of the *nodes* (SDNode) rather than values
1377 // (SDValue) here because the node may reference multiple values
1378 // (for example, the chain value of a load node).
1379 Replace0 &= !N0->hasOneUse();
1380 Replace1 &= (N0 != N1) && !N1->hasOneUse();
1382 // Combine Op here so it is preserved past replacements.
1383 CombineTo(Op.getNode(), RV);
1385 // If operands have a use ordering, make sure we deal with
1386 // predecessor first.
1387 if (Replace0 && Replace1 && N0->isPredecessorOf(N1.getNode())) {
1389 std::swap(NN0, NN1)
1393 AddToWorklist(NN0.getNode());
1394 ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode())
1397 AddToWorklist(NN1.getNode());
1398 ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode())
1405 /// Promote the specified integer shift operation if the target indicates it is
1406 /// beneficial. e.g. On x86, it's usually better to promote i16 operations to
1407 /// i32 since i16 instructions are longer.
1408 SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
1409 if (!LegalOperations)
// Only scalar integer shifts are candidates for promotion.
1412 EVT VT = Op.getValueType();
1413 if (VT.isVector() || !VT.isInteger())
1416 // If operation type is 'undesirable', e.g. i16 on x86, consider
1418 unsigned Opc = Op.getOpcode();
1419 if (TLI.isTypeDesirableForOp(Opc, VT))
1423 // Consult target whether it is a good idea to promote this operation and
1424 // what's the right type to promote it to.
1425 if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
1426 assert(PVT != VT && "Don't know what type to promote to!");
1428 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
1430 bool Replace = false;
1431 SDValue N0 = Op.getOperand(0);
// Arithmetic right shift needs the sign bits correct; logical right shift
// needs the high bits zero; other shifts can take any extension.
1432 if (Opc == ISD::SRA)
1433 N0 = SExtPromoteOperand(N0, PVT);
1434 else if (Opc == ISD::SRL)
1435 N0 = ZExtPromoteOperand(N0, PVT);
1437 N0 = PromoteOperand(N0, PVT, Replace)
// The shift amount operand keeps its type; only the value is widened.
1443 SDValue N1 = Op.getOperand(1);
1445 DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
1448 ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode())
1450 // Deal with Op being deleted.
1451 if (Op && Op.getOpcode() != ISD::DELETED_NODE)
// Promote an extension node to a wider desirable type when the target asks
// for it; an extend-of-extend collapses to a single extend of the source.
1457 SDValue DAGCombiner::PromoteExtend(SDValue Op) {
1458 if (!LegalOperations)
// Only scalar integer extensions are candidates for promotion.
1461 EVT VT = Op.getValueType();
1462 if (VT.isVector() || !VT.isInteger())
1465 // If operation type is 'undesirable', e.g. i16 on x86, consider
1467 unsigned Opc = Op.getOpcode();
1468 if (TLI.isTypeDesirableForOp(Opc, VT))
1472 // Consult target whether it is a good idea to promote this operation and
1473 // what's the right type to promote it to.
1474 if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
1475 assert(PVT != VT && "Don't know what type to promote to!");
1476 // fold (aext (aext x)) -> (aext x)
1477 // fold (aext (zext x)) -> (zext x)
1478 // fold (aext (sext x)) -> (sext x)
1479 LLVM_DEBUG(dbgs() << "\nPromoting "; Op.dump(&DAG));
1480 return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
// Widen an unindexed load to a target-preferred type (as an ext-load) and
// replace the original with a truncate of the wider load. Returns true if
// a replacement was made.
1485 bool DAGCombiner::PromoteLoad(SDValue Op) {
1486 if (!LegalOperations)
1489 if (!ISD::isUNINDEXEDLoad(Op.getNode()))
// Only scalar integer loads are candidates for promotion.
1492 EVT VT = Op.getValueType();
1493 if (VT.isVector() || !VT.isInteger())
1496 // If operation type is 'undesirable', e.g. i16 on x86, consider
1498 unsigned Opc = Op.getOpcode();
1499 if (TLI.isTypeDesirableForOp(Opc, VT))
1503 // Consult target whether it is a good idea to promote this operation and
1504 // what's the right type to promote it to.
1505 if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
1506 assert(PVT != VT && "Don't know what type to promote to!");
1509 SDNode *N = Op.getNode();
1510 LoadSDNode *LD = cast<LoadSDNode>(N);
1511 EVT MemVT = LD->getMemoryVT();
// A plain load is rebuilt as an EXTLOAD of the wider type; an extending
// load keeps its extension kind.
1512 ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD) ? ISD::EXTLOAD
1513 : LD->getExtensionType();
1514 SDValue NewLD = DAG.getExtLoad(ExtType, DL, PVT,
1515 LD->getChain(), LD->getBasePtr(),
1516 MemVT, LD->getMemOperand());
1517 SDValue Result = DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
1519 LLVM_DEBUG(dbgs() << "\nPromoting "; N->dump(&DAG); dbgs() << "\nTo: ";
1520 Result.dump(&DAG); dbgs() << '\n');
// Value result (0) gets the truncation; chain result (1) gets the new chain.
1522 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1523 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
1525 AddToWorklist(Result.getNode());
1526 recursivelyDeleteUnusedNodes(N);
1533 /// Recursively delete a node which has no uses and any operands for
1534 /// which it is the only use.
1536 /// Note that this both deletes the nodes and removes them from the worklist.
1537 /// It also adds any nodes who have had a user deleted to the worklist as they
1538 /// may now have only one use and subject to other combines.
1539 bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
1540 if (!N->use_empty())
// Worklist of candidate dead nodes; SmallSetVector gives uniqueness plus
// deterministic pop order.
1543 SmallSetVector<SDNode *, 16> Nodes;
1546 N = Nodes.pop_back_val();
1550 if (N->use_empty()) {
// Operands may become dead once N is gone, so queue them for a look.
1551 for (const SDValue &ChildN : N->op_values())
1552 Nodes.insert(ChildN.getNode());
1554 removeFromWorklist(N)
1559 } while (!Nodes.empty());
1563 //===----------------------------------------------------------------------===//
1564 // Main DAG Combiner implementation
1565 //===----------------------------------------------------------------------===//
// Main driver: seed the worklist with every node in the DAG, then repeatedly
// pop a node, try to combine it, and propagate replacements until the
// worklist is empty.
1567 void DAGCombiner::Run(CombineLevel AtLevel) {
1568 // set the instance variables, so that the various visit routines may use it.
1570 LegalDAG = Level >= AfterLegalizeDAG;
1571 LegalOperations = Level >= AfterLegalizeVectorOps;
1572 LegalTypes = Level >= AfterLegalizeTypes;
1574 WorklistInserter AddNodes(*this);
1576 // Add all the dag nodes to the worklist.
1577 for (SDNode &Node : DAG.allnodes())
1578 AddToWorklist(&Node);
1580 // Create a dummy node (which is not added to allnodes), that adds a reference
1581 // to the root node, preventing it from being deleted, and tracking any
1582 // changes of the root.
1583 HandleSDNode Dummy(DAG.getRoot());
1585 // While we have a valid worklist entry node, try to combine it.
1586 while (SDNode *N = getNextWorklistEntry()) {
1587 // If N has no uses, it is dead. Make sure to revisit all N's operands once
1588 // N is deleted from the DAG, since they too may now be dead or may have a
1589 // reduced number of uses, allowing other xforms.
1590 if (recursivelyDeleteUnusedNodes(N))
1593 WorklistRemover DeadNodes(*this);
1595 // If this combine is running after legalizing the DAG, re-legalize any
1596 // nodes pulled off the worklist.
1598 SmallSetVector<SDNode *, 16> UpdatedNodes;
1599 bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
1601 for (SDNode *LN : UpdatedNodes)
1602 AddToWorklistWithUsers(LN)
1608 LLVM_DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
1610 // Add any operands of the new node which have not yet been combined to the
1611 // worklist as well. Because the worklist uniques things already, this
1612 // won't repeatedly process the same operand.
1613 CombinedNodes.insert(N);
1614 for (const SDValue &ChildN : N->op_values())
1615 if (!CombinedNodes.count(ChildN.getNode()))
1616 AddToWorklist(ChildN.getNode());
1618 SDValue RV = combine(N);
1625 // If we get back the same node we passed in, rather than a new node or
1626 // zero, we know that the node must have defined multiple values and
1627 // CombineTo was used. Since CombineTo takes care of the worklist
1628 // mechanics for us, we have no work to do in this case.
1629 if (RV.getNode() == N)
1632 assert(N->getOpcode() != ISD::DELETED_NODE &&
1633 RV.getOpcode() != ISD::DELETED_NODE &&
1634 "Node was deleted but visit returned new node!");
1636 LLVM_DEBUG(dbgs() << " ... into: "; RV.dump(&DAG));
// Multi-result nodes are replaced wholesale; single-result replacement is
// value-based and must agree on type.
1638 if (N->getNumValues() == RV->getNumValues())
1639 DAG.ReplaceAllUsesWith(N, RV.getNode());
1641 assert(N->getValueType(0) == RV.getValueType() &&
1642 N->getNumValues() == 1 && "Type mismatch");
1643 DAG.ReplaceAllUsesWith(N, &RV)
1646 // Push the new node and any users onto the worklist. Omit this if the
1647 // new node is the EntryToken (e.g. if a store managed to get optimized
1648 // out), because re-visiting the EntryToken and its users will not uncover
1649 // any additional opportunities, but there may be a large number of such
1650 // users, potentially causing compile time explosion.
1651 if (RV.getOpcode() != ISD::EntryToken) {
1652 AddToWorklist(RV.getNode());
1653 AddUsersToWorklist(RV.getNode())
1656 // Finally, if the node is now dead, remove it from the graph. The node
1657 // may not be dead if the replacement process recursively simplified to
1658 // something else needing this node. This will also take care of adding any
1659 // operands which have lost a user to the worklist.
1660 recursivelyDeleteUnusedNodes(N)
1663 // If the root changed (e.g. it was a dead load, update the root).
1664 DAG.setRoot(Dummy.getValue());
1665 DAG.RemoveDeadNodes();
// Dispatch N to the opcode-specific visit routine. Returns the replacement
// value, or a null SDValue when no generic combine applies.
1668 SDValue DAGCombiner::visit(SDNode *N) {
1669 switch (N->getOpcode()) {
1671 case ISD::TokenFactor: return visitTokenFactor(N);
1672 case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
1673 case ISD::ADD: return visitADD(N);
1674 case ISD::SUB: return visitSUB(N);
1676 case ISD::UADDSAT: return visitADDSAT(N);
1678 case ISD::USUBSAT: return visitSUBSAT(N);
1679 case ISD::ADDC: return visitADDC(N);
1681 case ISD::UADDO: return visitADDO(N);
1682 case ISD::SUBC: return visitSUBC(N);
1684 case ISD::USUBO: return visitSUBO(N);
1685 case ISD::ADDE: return visitADDE(N);
1686 case ISD::ADDCARRY: return visitADDCARRY(N);
1687 case ISD::SADDO_CARRY: return visitSADDO_CARRY(N);
1688 case ISD::SUBE: return visitSUBE(N);
1689 case ISD::SUBCARRY: return visitSUBCARRY(N);
1690 case ISD::SSUBO_CARRY: return visitSSUBO_CARRY(N);
1692 case ISD::SMULFIXSAT:
1694 case ISD::UMULFIXSAT: return visitMULFIX(N);
1695 case ISD::MUL: return visitMUL(N);
1696 case ISD::SDIV: return visitSDIV(N);
1697 case ISD::UDIV: return visitUDIV(N);
1699 case ISD::UREM: return visitREM(N);
1700 case ISD::MULHU: return visitMULHU(N);
1701 case ISD::MULHS: return visitMULHS(N);
1702 case ISD::AVGFLOORS:
1703 case ISD::AVGFLOORU:
1705 case ISD::AVGCEILU: return visitAVG(N);
1706 case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
1707 case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
1709 case ISD::UMULO: return visitMULO(N);
1713 case ISD::UMAX: return visitIMINMAX(N);
1714 case ISD::AND: return visitAND(N);
1715 case ISD::OR: return visitOR(N);
1716 case ISD::XOR: return visitXOR(N);
1717 case ISD::SHL: return visitSHL(N);
1718 case ISD::SRA: return visitSRA(N);
1719 case ISD::SRL: return visitSRL(N);
1721 case ISD::ROTL: return visitRotate(N);
1723 case ISD::FSHR: return visitFunnelShift(N);
1725 case ISD::USHLSAT: return visitSHLSAT(N);
1726 case ISD::ABS: return visitABS(N);
1727 case ISD::BSWAP: return visitBSWAP(N);
1728 case ISD::BITREVERSE: return visitBITREVERSE(N);
1729 case ISD::CTLZ: return visitCTLZ(N);
1730 case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
1731 case ISD::CTTZ: return visitCTTZ(N);
1732 case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
1733 case ISD::CTPOP: return visitCTPOP(N);
1734 case ISD::SELECT: return visitSELECT(N);
1735 case ISD::VSELECT: return visitVSELECT(N);
1736 case ISD::SELECT_CC: return visitSELECT_CC(N);
1737 case ISD::SETCC: return visitSETCC(N);
1738 case ISD::SETCCCARRY: return visitSETCCCARRY(N);
1739 case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
1740 case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
1741 case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
1742 case ISD::AssertSext:
1743 case ISD::AssertZext: return visitAssertExt(N);
1744 case ISD::AssertAlign: return visitAssertAlign(N);
1745 case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
1746 case ISD::SIGN_EXTEND_VECTOR_INREG:
1747 case ISD::ZERO_EXTEND_VECTOR_INREG: return visitEXTEND_VECTOR_INREG(N);
1748 case ISD::TRUNCATE: return visitTRUNCATE(N);
1749 case ISD::BITCAST: return visitBITCAST(N);
1750 case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
1751 case ISD::FADD: return visitFADD(N);
1752 case ISD::STRICT_FADD: return visitSTRICT_FADD(N);
1753 case ISD::FSUB: return visitFSUB(N);
1754 case ISD::FMUL: return visitFMUL(N);
1755 case ISD::FMA: return visitFMA(N);
1756 case ISD::FDIV: return visitFDIV(N);
1757 case ISD::FREM: return visitFREM(N);
1758 case ISD::FSQRT: return visitFSQRT(N);
1759 case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
1760 case ISD::FPOW: return visitFPOW(N);
1761 case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
1762 case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
1763 case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
1764 case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
1765 case ISD::FP_ROUND: return visitFP_ROUND(N);
1766 case ISD::FP_EXTEND: return visitFP_EXTEND(N);
1767 case ISD::FNEG: return visitFNEG(N);
1768 case ISD::FABS: return visitFABS(N);
1769 case ISD::FFLOOR: return visitFFLOOR(N);
1773 case ISD::FMAXIMUM: return visitFMinMax(N);
1774 case ISD::FCEIL: return visitFCEIL(N);
1775 case ISD::FTRUNC: return visitFTRUNC(N);
1776 case ISD::BRCOND: return visitBRCOND(N);
1777 case ISD::BR_CC: return visitBR_CC(N);
1778 case ISD::LOAD: return visitLOAD(N);
1779 case ISD::STORE: return visitSTORE(N);
1780 case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
1781 case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
1782 case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
1783 case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
1784 case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
1785 case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
1786 case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
1787 case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
1788 case ISD::MGATHER: return visitMGATHER(N);
1789 case ISD::MLOAD: return visitMLOAD(N);
1790 case ISD::MSCATTER: return visitMSCATTER(N);
1791 case ISD::MSTORE: return visitMSTORE(N);
1792 case ISD::LIFETIME_END: return visitLIFETIME_END(N);
1793 case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
1794 case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
1795 case ISD::FP_TO_BF16: return visitFP_TO_BF16(N);
1796 case ISD::FREEZE: return visitFREEZE(N);
1797 case ISD::VECREDUCE_FADD:
1798 case ISD::VECREDUCE_FMUL:
1799 case ISD::VECREDUCE_ADD:
1800 case ISD::VECREDUCE_MUL:
1801 case ISD::VECREDUCE_AND:
1802 case ISD::VECREDUCE_OR:
1803 case ISD::VECREDUCE_XOR:
1804 case ISD::VECREDUCE_SMAX:
1805 case ISD::VECREDUCE_SMIN:
1806 case ISD::VECREDUCE_UMAX:
1807 case ISD::VECREDUCE_UMIN:
1808 case ISD::VECREDUCE_FMAX:
1809 case ISD::VECREDUCE_FMIN: return visitVECREDUCE(N);
// Generate a case label for every VP (vector-predicated) opcode declared
// in the .def file; they all funnel through visitVPOp.
1810 #define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) case ISD::SDOPC:
1811 #include "llvm/IR/VPIntrinsics.def"
1812 return visitVPOp(N);
// Top-level per-node combine: try the generic visit routines, then
// target-specific combines, then integer promotion, and finally CSE against
// an already-existing commuted form of the node.
1817 SDValue DAGCombiner::combine(SDNode *N) {
1819 if (!DisableGenericCombines)
1822 // If nothing happened, try a target-specific DAG combine.
1823 if (!RV.getNode()) {
1824 assert(N->getOpcode() != ISD::DELETED_NODE &&
1825 "Node was deleted but visit returned NULL!");
1827 if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
1828 TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
1830 // Expose the DAG combiner to the target combiner impls.
1831 TargetLowering::DAGCombinerInfo
1832 DagCombineInfo(DAG, Level, false, this);
1834 RV = TLI.PerformDAGCombine(N, DagCombineInfo)
1838 // If nothing happened still, try promoting the operation.
1839 if (!RV.getNode()) {
1840 switch (N->getOpcode()) {
1848 RV = PromoteIntBinOp(SDValue(N, 0))
1853 RV = PromoteIntShiftOp(SDValue(N, 0))
1855 case ISD::SIGN_EXTEND:
1856 case ISD::ZERO_EXTEND:
1857 case ISD::ANY_EXTEND:
1858 RV = PromoteExtend(SDValue(N, 0));
1861 if (PromoteLoad(SDValue(N, 0)))
1867 // If N is a commutative binary node, try to eliminate it if the commuted
1868 // version is already present in the DAG.
1869 if (!RV.getNode() && TLI.isCommutativeBinOp(N->getOpcode())) {
1870 SDValue N0 = N->getOperand(0);
1871 SDValue N1 = N->getOperand(1);
1873 // Constant operands are canonicalized to RHS.
1874 if (N0 != N1 && (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1))) {
1875 SDValue Ops[] = {N1, N0};
1876 SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
1879 return SDValue(CSENode, 0)
1886 /// Given a node, return its input chain if it has one, otherwise return a null
// Chain operands are identified by having type MVT::Other. The first and
// last operands are checked first since chains conventionally appear there;
// the loop then scans any remaining interior operands.
1888 static SDValue getInputChainForNode(SDNode *N) {
1889 if (unsigned NumOps = N->getNumOperands()) {
1890 if (N->getOperand(0).getValueType() == MVT::Other)
1891 return N->getOperand(0);
1892 if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
1893 return N->getOperand(NumOps-1);
1894 for (unsigned i = 1; i < NumOps-1; ++i)
1895 if (N->getOperand(i).getValueType() == MVT::Other)
1896 return N->getOperand(i);
// Simplify a TokenFactor node: drop redundant chains, inline single-use
// child TokenFactors (up to TokenFactorInlineLimit), and prune operands
// that are transitively reachable through another operand's chain.
// NOTE(review): this listing omits some original lines; the comments below
// describe only the visible code.
1901 SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
1902 // If N has two operands, where one has an input chain equal to the other,
1903 // the 'other' chain is redundant.
1904 if (N->getNumOperands() == 2) {
1905 if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
1906 return N->getOperand(0);
1907 if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
1908 return N->getOperand(1);
1911 // Don't simplify token factors if optnone.
1912 if (OptLevel == CodeGenOpt::None)
1915 // Don't simplify the token factor if the node itself has too many operands.
1916 if (N->getNumOperands() > TokenFactorInlineLimit)
1919 // If the sole user is a token factor, we should make sure we have a
1920 // chance to merge them together. This prevents TF chains from inhibiting
1922 if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
1923 AddToWorklist(*(N->use_begin()));
1925 SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
1926 SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
1927 SmallPtrSet<SDNode*, 16> SeenOps;
1928 bool Changed = false; // If we should replace this token factor.
1930 // Start out with this token factor.
1933 // Iterate through token factors. The TFs grows when new token factors are
1935 for (unsigned i = 0; i < TFs.size(); ++i) {
1936 // Limit number of nodes to inline, to avoid quadratic compile times.
1937 // We have to add the outstanding Token Factors to Ops, otherwise we might
1938 // drop Ops from the resulting Token Factors.
1939 if (Ops.size() > TokenFactorInlineLimit) {
1940 for (unsigned j = i; j < TFs.size(); j++)
1941 Ops.emplace_back(TFs[j], 0);
1942 // Drop unprocessed Token Factors from TFs, so we do not add them to the
1943 // combiner worklist later.
1948 SDNode *TF = TFs[i];
1949 // Check each of the operands.
1950 for (const SDValue &Op : TF->op_values()) {
1951 switch (Op.getOpcode()) {
1952 case ISD::EntryToken:
1953 // Entry tokens don't need to be added to the list. They are
1958 case ISD::TokenFactor:
// Only inline a child TokenFactor when this node is its sole user;
// otherwise the child must survive for its other users.
1959 if (Op.hasOneUse() && !is_contained(TFs, Op.getNode())) {
1960 // Queue up for processing.
1961 TFs.push_back(Op.getNode());
1968 // Only add if it isn't already in the list.
1969 if (SeenOps.insert(Op.getNode()).second)
1978 // Re-visit inlined Token Factors, to clean them up in case they have been
1979 // removed. Skip the first Token Factor, as this is the current node.
1980 for (unsigned i = 1, e = TFs.size(); i < e; i++)
1981 AddToWorklist(TFs[i]);
1983 // Remove Nodes that are chained to another node in the list. Do so
1984 // by walking up chains breadth-first stopping when we've seen
1985 // another operand. In general we must climb to the EntryNode, but we can exit
1986 // early if we find all remaining work is associated with just one operand as
1987 // no further pruning is possible.
1989 // List of nodes to search through and original Ops from which they originate.
1990 SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
1991 SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
1992 SmallPtrSet<SDNode *, 16> SeenChains;
1993 bool DidPruneOps = false;
1995 unsigned NumLeftToConsider = 0;
// Seed the breadth-first search with each operand, tagged by its index.
1996 for (const SDValue &Op : Ops) {
1997 Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
1998 OpWorkCount.push_back(1);
// Local helper (shadows the member function of the same name): record that
// chain node Op was reached while searching on behalf of operand OpNumber.
2001 auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
2002 // If this is an Op, we can remove the op from the list. Remark any
2003 // search associated with it as from the current OpNumber.
2004 if (SeenOps.contains(Op)) {
2007 unsigned OrigOpNumber = 0;
2008 while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
2010 assert((OrigOpNumber != Ops.size()) &&
2011 "expected to find TokenFactor Operand")::
2012 // Re-mark worklist from OrigOpNumber to OpNumber
2013 for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
2014 if (Worklist[i].second == OrigOpNumber) {
2015 Worklist[i].second = OpNumber;
2018 OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
2019 OpWorkCount[OrigOpNumber] = 0;
2020 NumLeftToConsider--;
2022 // Add if it's a new chain
2023 if (SeenChains.insert(Op).second) {
2024 OpWorkCount[OpNumber]++;
2025 Worklist.push_back(std::make_pair(Op, OpNumber));
// Bounded BFS (at most 1024 nodes) climbing up the chain operands.
2029 for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
2030 // We need to consider at least 2 Ops to prune.
2031 if (NumLeftToConsider <= 1)
2033 auto CurNode = Worklist[i].first;
2034 auto CurOpNumber = Worklist[i].second;
2035 assert((OpWorkCount[CurOpNumber] > 0) &&
2036 "Node should not appear in worklist");
2037 switch (CurNode->getOpcode()) {
2038 case ISD::EntryToken:
2039 // Hitting EntryToken is the only way for the search to terminate without
2041 // another operand's search. Prevent us from marking this operand
2043 NumLeftToConsider++;
2045 case ISD::TokenFactor:
2046 for (const SDValue &Op : CurNode->op_values())
2047 AddToWorklist(i, Op.getNode(), CurOpNumber);
2049 case ISD::LIFETIME_START:
2050 case ISD::LIFETIME_END:
2051 case ISD::CopyFromReg:
2052 case ISD::CopyToReg:
// These nodes carry their chain in operand 0; continue the climb there.
2053 AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
2056 if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
2057 AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
2060 OpWorkCount[CurOpNumber]--;
2061 if (OpWorkCount[CurOpNumber] == 0)
2062 NumLeftToConsider--;
2065 // If we've changed things around then replace token factor.
2069 // The entry token is the only possible outcome.
2070 Result = DAG.getEntryNode();
2073 SmallVector<SDValue, 8> PrunedOps;
// Keep only operands that were never reached through another operand's
// chain walk; the rest are redundant.
2075 for (const SDValue &Op : Ops) {
2076 if (SeenChains.count(Op.getNode()) == 0)
2077 PrunedOps.push_back(Op);
2079 Result = DAG.getTokenFactor(SDLoc(N), PrunedOps);
2081 Result = DAG.getTokenFactor(SDLoc(N), Ops);
2089 /// MERGE_VALUES can always be eliminated.
// Each result of a MERGE_VALUES is just its corresponding operand, so the
// node is removed by rewriting every use of result i to operand i.
2090 SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
2091 WorklistRemover DeadNodes(*this);
2092 // Replacing results may cause a different MERGE_VALUES to suddenly
2093 // be CSE'd with N, and carry its uses with it. Iterate until no
2094 // uses remain, to ensure that the node can be safely deleted.
2095 // First add the users of this node to the work list so that they
2096 // can be tried again once they have new operands.
2097 AddUsersToWorklist(N);
2099 // Do as a single replacement to avoid rewalking use lists.
2100 SmallVector<SDValue, 8> Ops;
2101 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
2102 Ops.push_back(N->getOperand(i));
2103 DAG.ReplaceAllUsesWith(N, Ops.data());
2104 } while (!N->use_empty());
2105 deleteAndRecombine(N);
2106 return SDValue(N, 0); // Return N so it doesn't get rechecked!
2109 /// If \p N is a ConstantSDNode with isOpaque() == false return it casted to a
2110 /// ConstantSDNode pointer else nullptr.
// Opaque constants are deliberately kept out of constant folding, so they
// are treated here the same as non-constants.
2111 static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
2112 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
2113 return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
2116 /// Return true if 'Use' is a load or a store that uses N as its base pointer
2117 /// and that N may be folded in the load / store addressing mode.
// The four branches below extract the memory VT and address space from the
// supported memory users (plain and masked loads/stores); indexed accesses
// and uses where N is not the base pointer are rejected.
2118 static bool canFoldInAddressingMode(SDNode *N, SDNode *Use, SelectionDAG &DAG,
2119 const TargetLowering &TLI) {
2123 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
2124 if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
2126 VT = LD->getMemoryVT();
2127 AS = LD->getAddressSpace();
2128 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
2129 if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
2131 VT = ST->getMemoryVT();
2132 AS = ST->getAddressSpace();
2133 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(Use)) {
2134 if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
2136 VT = LD->getMemoryVT();
2137 AS = LD->getAddressSpace();
2138 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(Use)) {
2139 if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
2141 VT = ST->getMemoryVT();
2142 AS = ST->getAddressSpace();
// Build an AddrMode describing N (base register plus an optional constant
// offset from an ADD/SUB) and ask the target whether it is legal.
2147 TargetLowering::AddrMode AM;
2148 if (N->getOpcode() == ISD::ADD) {
2149 AM.HasBaseReg = true;
2150 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
2153 AM.BaseOffs = Offset->getSExtValue();
2157 } else if (N->getOpcode() == ISD::SUB) {
2158 AM.HasBaseReg = true;
2159 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
// SUB of a constant is the same as ADD of its negation.
2162 AM.BaseOffs = -Offset->getSExtValue();
2170 return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
2171 VT.getTypeForEVT(*DAG.getContext()), AS);
2174 /// This inverts a canonicalization in IR that replaces a variable select arm
2175 /// with an identity constant. Codegen improves if we re-use the variable
2176 /// operand rather than load a constant. This can also be converted into a
2177 /// masked vector operation if the target supports it.
// Transform: binop N0, (vselect Cond, IDC, V) --> vselect Cond, N0, (binop N0, V)
// (and the symmetric form when the identity constant is the false arm).
2178 static SDValue foldSelectWithIdentityConstant(SDNode *N, SelectionDAG &DAG,
2179 bool ShouldCommuteOperands) {
2180 // Match a select as operand 1. The identity constant that we are looking for
2181 // is only valid as operand 1 of a non-commutative binop.
2182 SDValue N0 = N->getOperand(0);
2183 SDValue N1 = N->getOperand(1);
2184 if (ShouldCommuteOperands)
2187 // TODO: Should this apply to scalar select too?
2188 if (N1.getOpcode() != ISD::VSELECT || !N1.hasOneUse())
2191 unsigned Opcode = N->getOpcode();
2192 EVT VT = N->getValueType(0);
2193 SDValue Cond = N1.getOperand(0);
2194 SDValue TVal = N1.getOperand(1);
2195 SDValue FVal = N1.getOperand(2);
2197 // TODO: The cases should match with IR's ConstantExpr::getBinOpIdentity().
2198 // TODO: Target-specific opcodes could be added. Ex: "isCommutativeBinOp()".
2199 // TODO: With fast-math (NSZ), allow the opposite-sign form of zero?
// Returns true if V is the right-identity constant for Opcode, i.e.
// "x Opcode V == x" for all x.
2200 auto isIdentityConstantForOpcode = [](unsigned Opcode, SDValue V) {
2201 if (ConstantFPSDNode *C = isConstOrConstSplatFP(V)) {
2203 case ISD::FADD: // X + -0.0 --> X
2204 return C->isZero() && C->isNegative();
2205 case ISD::FSUB: // X - 0.0 --> X
2206 return C->isZero() && !C->isNegative();
2207 case ISD::FMUL: // X * 1.0 --> X
2208 case ISD::FDIV: // X / 1.0 --> X
2209 return C->isExactlyValue(1.0);
2212 if (ConstantSDNode *C = isConstOrConstSplat(V)) {
2214 case ISD::ADD: // X + 0 --> X
2215 case ISD::SUB: // X - 0 --> X
2216 case ISD::SHL: // X << 0 --> X
2217 case ISD::SRA: // X s>> 0 --> X
2218 case ISD::SRL: // X u>> 0 --> X
2220 case ISD::MUL: // X * 1 --> X
2227 // This transform increases uses of N0, so freeze it to be safe.
2228 // binop N0, (vselect Cond, IDC, FVal) --> vselect Cond, N0, (binop N0, FVal)
2229 if (isIdentityConstantForOpcode(Opcode, TVal)) {
2230 SDValue F0 = DAG.getFreeze(N0);
2231 SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, FVal, N->getFlags());
2232 return DAG.getSelect(SDLoc(N), VT, Cond, F0, NewBO);
2234 // binop N0, (vselect Cond, TVal, IDC) --> vselect Cond, (binop N0, TVal), N0
2235 if (isIdentityConstantForOpcode(Opcode, FVal)) {
2236 SDValue F0 = DAG.getFreeze(N0);
2237 SDValue NewBO = DAG.getNode(Opcode, SDLoc(N), VT, F0, TVal, N->getFlags());
2238 return DAG.getSelect(SDLoc(N), VT, Cond, NewBO, F0);
// Fold "binop (select Cond, CT, CF), CBO" into
// "select Cond, (binop CT, CBO), (binop CF, CBO)" when both arms constant-fold,
// eliminating the binop. Also tries foldSelectWithIdentityConstant first.
// NOTE(review): this listing omits some original lines; the comments below
// describe only the visible code.
2244 SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
2245 assert(TLI.isBinOp(BO->getOpcode()) && BO->getNumValues() == 1 &&
2246 "Unexpected binary operator");
2248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2249 auto BinOpcode = BO->getOpcode();
2250 EVT VT = BO->getValueType(0);
2251 if (TLI.shouldFoldSelectWithIdentityConstant(BinOpcode, VT)) {
2252 if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, false))
// For commutative ops, also try with the operands swapped.
2255 if (TLI.isCommutativeBinOp(BO->getOpcode()))
2256 if (SDValue Sel = foldSelectWithIdentityConstant(BO, DAG, true))
2260 // Don't do this unless the old select is going away. We want to eliminate the
2261 // binary operator, not replace a binop with a select.
2262 // TODO: Handle ISD::SELECT_CC.
2263 unsigned SelOpNo = 0;
2264 SDValue Sel = BO->getOperand(0);
2265 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse()) {
2267 Sel = BO->getOperand(1);
2270 if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
// Both select arms must be (FP or integer) constants for the fold to
// remove the binop... except for the CanFoldNonConst case below.
2273 SDValue CT = Sel.getOperand(1);
2274 if (!isConstantOrConstantVector(CT, true) &&
2275 !DAG.isConstantFPBuildVectorOrConstantFP(CT))
2278 SDValue CF = Sel.getOperand(2);
2279 if (!isConstantOrConstantVector(CF, true) &&
2280 !DAG.isConstantFPBuildVectorOrConstantFP(CF))
2283 // Bail out if any constants are opaque because we can't constant fold those.
2284 // The exception is "and" and "or" with either 0 or -1 in which case we can
2285 // propagate non constant operands into select. I.e.:
2286 // and (select Cond, 0, -1), X --> select Cond, 0, X
2287 // or X, (select Cond, -1, 0) --> select Cond, -1, X
2288 bool CanFoldNonConst =
2289 (BinOpcode == ISD::AND || BinOpcode == ISD::OR) &&
2290 (isNullOrNullSplat(CT) || isAllOnesOrAllOnesSplat(CT)) &&
2291 (isNullOrNullSplat(CF) || isAllOnesOrAllOnesSplat(CF));
2293 SDValue CBO = BO->getOperand(SelOpNo ^ 1);
2294 if (!CanFoldNonConst &&
2295 !isConstantOrConstantVector(CBO, true) &&
2296 !DAG.isConstantFPBuildVectorOrConstantFP(CBO))
2299 // We have a select-of-constants followed by a binary operator with a
2300 // constant. Eliminate the binop by pulling the constant math into the select.
2301 // Example: add (select Cond, CT, CF), CBO --> select Cond, CT + CBO, CF + CBO
// SelOpNo records which binop operand held the select; preserve operand
// order for non-commutative opcodes.
2303 SDValue NewCT = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CT)
2304 : DAG.getNode(BinOpcode, DL, VT, CT, CBO);
2305 if (!CanFoldNonConst && !NewCT.isUndef() &&
2306 !isConstantOrConstantVector(NewCT, true) &&
2307 !DAG.isConstantFPBuildVectorOrConstantFP(NewCT))
2310 SDValue NewCF = SelOpNo ? DAG.getNode(BinOpcode, DL, VT, CBO, CF)
2311 : DAG.getNode(BinOpcode, DL, VT, CF, CBO);
2312 if (!CanFoldNonConst && !NewCF.isUndef() &&
2313 !isConstantOrConstantVector(NewCF, true) &&
2314 !DAG.isConstantFPBuildVectorOrConstantFP(NewCF))
2317 SDValue SelectOp = DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
// Propagate the binop's fast-math/overflow flags onto the new select.
2318 SelectOp->setFlags(BO->getFlags());
// Fold add/sub of a zero-extended "low bit is clear" test into a sub/add of
// the low bit itself with an adjusted constant:
//   add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
//   sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
2322 static SDValue foldAddSubBoolOfMaskedVal(SDNode *N, SelectionDAG &DAG) {
2323 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
2324 "Expecting add or sub");
2326 // Match a constant operand and a zext operand for the math instruction:
// For ADD the constant is canonicalized to the RHS; for SUB the pattern of
// interest is "C - zext(...)", so the constant is the LHS.
2329 bool IsAdd = N->getOpcode() == ISD::ADD;
2330 SDValue C = IsAdd ? N->getOperand(1) : N->getOperand(0);
2331 SDValue Z = IsAdd ? N->getOperand(0) : N->getOperand(1);
2332 auto *CN = dyn_cast<ConstantSDNode>(C);
2333 if (!CN || Z.getOpcode() != ISD::ZERO_EXTEND)
2336 // Match the zext operand as a setcc of a boolean.
2337 if (Z.getOperand(0).getOpcode() != ISD::SETCC ||
2338 Z.getOperand(0).getValueType() != MVT::i1)
2341 // Match the compare as: setcc (X & 1), 0, eq.
2342 SDValue SetCC = Z.getOperand(0);
2343 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
2344 if (CC != ISD::SETEQ || !isNullConstant(SetCC.getOperand(1)) ||
2345 SetCC.getOperand(0).getOpcode() != ISD::AND ||
2346 !isOneConstant(SetCC.getOperand(0).getOperand(1)))
2349 // We are adding/subtracting a constant and an inverted low bit. Turn that
2350 // into a subtract/add of the low bit with incremented/decremented constant:
2351 // add (zext i1 (seteq (X & 1), 0)), C --> sub C+1, (zext (X & 1))
2352 // sub C, (zext i1 (seteq (X & 1), 0)) --> add C-1, (zext (X & 1))
2353 EVT VT = C.getValueType();
2355 SDValue LowBit = DAG.getZExtOrTrunc(SetCC.getOperand(0), DL, VT);
2356 SDValue C1 = IsAdd ? DAG.getConstant(CN->getAPIntValue() + 1, DL, VT) :
2357 DAG.getConstant(CN->getAPIntValue() - 1, DL, VT);
2358 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, C1, LowBit);
2361 /// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
2362 /// a shift and add with a different constant.
// Patterns handled (where the shift amount isolates the sign bit):
//   add (srl (not X), BW-1), C --> add (sra X, BW-1), (C + 1)
//   sub C, (srl (not X), BW-1) --> add (srl X, BW-1), (C - 1)
2363 static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
2364 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
2365 "Expecting add or sub");
2367 // We need a constant operand for the add/sub, and the other operand is a
2368 // logical shift right: add (srl), C or sub C, (srl).
2369 bool IsAdd = N->getOpcode() == ISD::ADD;
2370 SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
2371 SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
2372 if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
2373 ShiftOp.getOpcode() != ISD::SRL)
2376 // The shift must be of a 'not' value.
2377 SDValue Not = ShiftOp.getOperand(0);
2378 if (!Not.hasOneUse() || !isBitwiseNot(Not))
2381 // The shift must be moving the sign bit to the least-significant-bit.
2382 EVT VT = ShiftOp.getValueType();
2383 SDValue ShAmt = ShiftOp.getOperand(1);
2384 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
2385 if (!ShAmtC || ShAmtC->getAPIntValue() != (VT.getScalarSizeInBits() - 1))
2388 // Eliminate the 'not' by adjusting the shift and add/sub constant:
2389 // add (srl (not X), 31), C --> add (sra X, 31), (C + 1)
2390 // sub C, (srl (not X), 31) --> add (srl X, 31), (C - 1)
// FoldConstantArithmetic computes C+1 (for add) or C-1 (for sub, via
// SUB C, 1); it can fail for opaque constants, in which case we bail.
2392 if (SDValue NewC = DAG.FoldConstantArithmetic(
2393 IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
2394 {ConstantOp, DAG.getConstant(1, DL, VT)})) {
2395 SDValue NewShift = DAG.getNode(IsAdd ? ISD::SRA : ISD::SRL, DL, VT,
2396 Not.getOperand(0), ShAmt);
2397 return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
// Return true if V behaves exactly like an ADD of its two operands:
// an OR with no common set bits (disjoint OR), or an XOR with the minimum
// signed constant (flipping only the sign bit, which is carry-free).
2403 static bool isADDLike(SDValue V, const SelectionDAG &DAG) {
2404 unsigned Opcode = V.getOpcode();
2405 if (Opcode == ISD::OR)
2406 return DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1));
2407 if (Opcode == ISD::XOR)
2408 return isMinSignedConstant(V.getOperand(1));
2412 /// Try to fold a node that behaves like an ADD (note that N isn't necessarily
2413 /// an ISD::ADD here, it could for example be an ISD::OR if we know that there
2414 /// are no common bits set in the operands).
// NOTE(review): this listing omits some original lines; the comments below
// describe only the visible code.
2415 SDValue DAGCombiner::visitADDLike(SDNode *N) {
2416 SDValue N0 = N->getOperand(0);
2417 SDValue N1 = N->getOperand(1);
2418 EVT VT = N0.getValueType();
2421 // fold (add x, undef) -> undef
2427 // fold (add c1, c2) -> c1+c2
2428 if (SDValue C = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1}))
2431 // canonicalize constant to RHS
2432 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2433 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2434 return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
2437 if (VT.isVector()) {
2438 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
2441 // fold (add x, 0) -> x, vector edition
2442 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
2446 // fold (add x, 0) -> x
2447 if (isNullConstant(N1))
2450 if (N0.getOpcode() == ISD::SUB) {
2451 SDValue N00 = N0.getOperand(0);
2452 SDValue N01 = N0.getOperand(1);
2454 // fold ((A-c1)+c2) -> (A+(c2-c1))
2455 if (SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N01}))
2456 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
2458 // fold ((c1-A)+c2) -> (c1+c2)-A
2459 if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N00}))
2460 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
2463 // add (sext i1 X), 1 -> zext (not i1 X)
2464 // We don't transform this pattern:
2465 // add (zext i1 X), -1 -> sext (not i1 X)
2466 // because most (?) targets generate better code for the zext form.
2467 if (N0.getOpcode() == ISD::SIGN_EXTEND && N0.hasOneUse() &&
2468 isOneOrOneSplat(N1)) {
2469 SDValue X = N0.getOperand(0);
2470 if ((!LegalOperations ||
2471 (TLI.isOperationLegal(ISD::XOR, X.getValueType()) &&
2472 TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) &&
2473 X.getScalarValueSizeInBits() == 1) {
2474 SDValue Not = DAG.getNOT(DL, X, X.getValueType());
2475 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
2479 // Fold (add (or x, c0), c1) -> (add x, (c0 + c1))
2480 // iff (or x, c0) is equivalent to (add x, c0).
2481 // Fold (add (xor x, c0), c1) -> (add x, (c0 + c1))
2482 // iff (xor x, c0) is equivalent to (add x, c0).
2483 if (isADDLike(N0, DAG)) {
2484 SDValue N01 = N0.getOperand(1);
2485 if (SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N01}))
2486 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add);
2489 if (SDValue NewSel = foldBinOpIntoSelect(N))
// Reassociation is skipped when it would break a (base + offset) pattern
// that the target could fold into an addressing mode.
2493 if (!reassociationCanBreakAddressingModePattern(ISD::ADD, DL, N, N0, N1)) {
2494 if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
2497 // Reassociate (add (or x, c), y) -> (add add(x, y), c)) if (or x, c) is
2498 // equivalent to (add x, c).
2499 // Reassociate (add (xor x, c), y) -> (add add(x, y), c)) if (xor x, c) is
2500 // equivalent to (add x, c).
2501 auto ReassociateAddOr = [&](SDValue N0, SDValue N1) {
2502 if (isADDLike(N0, DAG) && N0.hasOneUse() &&
2503 isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
2504 return DAG.getNode(ISD::ADD, DL, VT,
2505 DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(0)),
2510 if (SDValue Add = ReassociateAddOr(N0, N1))
2512 if (SDValue Add = ReassociateAddOr(N1, N0))
2515 // fold ((0-A) + B) -> B-A
2516 if (N0.getOpcode() == ISD::SUB && isNullOrNullSplat(N0.getOperand(0)))
2517 return DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
2519 // fold (A + (0-B)) -> A-B
2520 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
2521 return DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(1));
2523 // fold (A+(B-A)) -> B
2524 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
2525 return N1.getOperand(0);
2527 // fold ((B-A)+A) -> B
2528 if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
2529 return N0.getOperand(0);
2531 // fold ((A-B)+(C-A)) -> (C-B)
2532 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2533 N0.getOperand(0) == N1.getOperand(1))
2534 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2537 // fold ((A-B)+(B-C)) -> (A-C)
2538 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2539 N0.getOperand(1) == N1.getOperand(0))
2540 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
2543 // fold (A+(B-(A+C))) to (B-C)
2544 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
2545 N0 == N1.getOperand(1).getOperand(0))
2546 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2547 N1.getOperand(1).getOperand(1));
2549 // fold (A+(B-(C+A))) to (B-C)
2550 if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
2551 N0 == N1.getOperand(1).getOperand(1))
2552 return DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(0),
2553 N1.getOperand(1).getOperand(0));
2555 // fold (A+((B-A)+or-C)) to (B+or-C)
2556 if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
2557 N1.getOperand(0).getOpcode() == ISD::SUB &&
2558 N0 == N1.getOperand(0).getOperand(1))
2559 return DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0).getOperand(0),
2562 // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
2563 if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB &&
2564 N0->hasOneUse() && N1->hasOneUse()) {
2565 SDValue N00 = N0.getOperand(0);
2566 SDValue N01 = N0.getOperand(1);
2567 SDValue N10 = N1.getOperand(0);
2568 SDValue N11 = N1.getOperand(1);
2570 if (isConstantOrConstantVector(N00) || isConstantOrConstantVector(N10))
2571 return DAG.getNode(ISD::SUB, DL, VT,
2572 DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
2573 DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
2576 // fold (add (umax X, C), -C) --> (usubsat X, C)
2577 if (N0.getOpcode() == ISD::UMAX && hasOperation(ISD::USUBSAT, VT)) {
// Match element-wise: the umax constant must be the negation of the add
// constant (undef lanes are allowed in either splat).
2578 auto MatchUSUBSAT = [](ConstantSDNode *Max, ConstantSDNode *Op) {
2579 return (!Max && !Op) ||
2580 (Max && Op && Max->getAPIntValue() == (-Op->getAPIntValue()));
2582 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchUSUBSAT,
2583 /*AllowUndefs*/ true))
2584 return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0),
2588 if (SimplifyDemandedBits(SDValue(N, 0)))
2589 return SDValue(N, 0);
2591 if (isOneOrOneSplat(N1)) {
2592 // fold (add (xor a, -1), 1) -> (sub 0, a)
2593 if (isBitwiseNot(N0))
2594 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
2597 // fold (add (add (xor a, -1), b), 1) -> (sub b, a)
2598 if (N0.getOpcode() == ISD::ADD) {
// Find which inner-add operand is the bitwise-not; A is the other one.
2601 if (isBitwiseNot(N0.getOperand(0))) {
2602 A = N0.getOperand(1);
2603 Xor = N0.getOperand(0);
2604 } else if (isBitwiseNot(N0.getOperand(1))) {
2605 A = N0.getOperand(0);
2606 Xor = N0.getOperand(1);
2610 return DAG.getNode(ISD::SUB, DL, VT, A, Xor.getOperand(0));
2614 // add (add x, y), 1
2615 // And if the target does not like this form then turn into:
2616 // sub y, (xor x, -1)
2617 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
2619 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
2620 DAG.getAllOnesConstant(DL, VT));
2621 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
2625 // (x - y) + -1 -> add (xor y, -1), x
2626 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
2627 isAllOnesOrAllOnesSplat(N1)) {
2628 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1), N1);
2629 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
// Try the operand-order-sensitive combines in both directions.
2632 if (SDValue Combined = visitADDLikeCommutative(N0, N1, N))
2635 if (SDValue Combined = visitADDLikeCommutative(N1, N0, N))
// Combine an ISD::ADD node: generic add-like folds first, then the
// bool-masked and sign-bit folds, disjoint-bits add->or, and
// VSCALE / STEP_VECTOR constant merging.
2641 SDValue DAGCombiner::visitADD(SDNode *N) {
2642 SDValue N0 = N->getOperand(0);
2643 SDValue N1 = N->getOperand(1);
2644 EVT VT = N0.getValueType();
2647 if (SDValue Combined = visitADDLike(N))
2650 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
2653 if (SDValue V = foldAddSubOfSignBit(N, DAG))
2656 // fold (a+b) -> (a|b) iff a and b share no bits.
2657 if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
2658 DAG.haveNoCommonBitsSet(N0, N1))
2659 return DAG.getNode(ISD::OR, DL, VT, N0, N1);
2661 // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
2662 if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
2663 const APInt &C0 = N0->getConstantOperandAPInt(0);
2664 const APInt &C1 = N1->getConstantOperandAPInt(0);
2665 return DAG.getVScale(DL, VT, C0 + C1);
2668 // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
2669 if ((N0.getOpcode() == ISD::ADD) &&
2670 (N0.getOperand(1).getOpcode() == ISD::VSCALE) &&
2671 (N1.getOpcode() == ISD::VSCALE)) {
2672 const APInt &VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
2673 const APInt &VS1 = N1->getConstantOperandAPInt(0);
2674 SDValue VS = DAG.getVScale(DL, VT, VS0 + VS1);
2675 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
2678 // Fold (add step_vector(c1), step_vector(c2) to step_vector(c1+c2))
2679 if (N0.getOpcode() == ISD::STEP_VECTOR &&
2680 N1.getOpcode() == ISD::STEP_VECTOR) {
2681 const APInt &C0 = N0->getConstantOperandAPInt(0);
2682 const APInt &C1 = N1->getConstantOperandAPInt(0);
2683 APInt NewStep = C0 + C1;
2684 return DAG.getStepVector(DL, VT, NewStep);
2687 // Fold a + step_vector(c1) + step_vector(c2) to a + step_vector(c1+c2)
2688 if ((N0.getOpcode() == ISD::ADD) &&
2689 (N0.getOperand(1).getOpcode() == ISD::STEP_VECTOR) &&
2690 (N1.getOpcode() == ISD::STEP_VECTOR)) {
2691 const APInt &SV0 = N0.getOperand(1)->getConstantOperandAPInt(0);
2692 const APInt &SV1 = N1->getConstantOperandAPInt(0);
2693 APInt NewStep = SV0 + SV1;
2694 SDValue SV = DAG.getStepVector(DL, VT, NewStep);
2695 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), SV);
// Combine saturating-add nodes (SADDSAT/UADDSAT): constant folding,
// canonicalization, identity folds, and strength-reduction to a plain ADD
// when overflow is provably impossible.
2701 SDValue DAGCombiner::visitADDSAT(SDNode *N) {
2702 unsigned Opcode = N->getOpcode();
2703 SDValue N0 = N->getOperand(0);
2704 SDValue N1 = N->getOperand(1);
2705 EVT VT = N0.getValueType();
2708 // fold (add_sat x, undef) -> -1
2709 if (N0.isUndef() || N1.isUndef())
2710 return DAG.getAllOnesConstant(DL, VT);
2712 // fold (add_sat c1, c2) -> c3
2713 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
2716 // canonicalize constant to RHS
2717 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2718 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2719 return DAG.getNode(Opcode, DL, VT, N1, N0);
2722 if (VT.isVector()) {
2723 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
2726 // fold (add_sat x, 0) -> x, vector edition
2727 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
2731 // fold (add_sat x, 0) -> x
2732 if (isNullConstant(N1))
2735 // If it cannot overflow, transform into an add.
2736 if (Opcode == ISD::UADDSAT)
2737 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2738 return DAG.getNode(ISD::ADD, DL, VT, N0, N1);
// Peel wrapper nodes off V and return the underlying carry-out value (result
// 1 of ADDCARRY/SUBCARRY/UADDO/USUBO) if V is one, else a null SDValue.
2743 static SDValue getAsCarry(const TargetLowering &TLI, SDValue V) {
2744 bool Masked = false;
2746 // First, peel away TRUNCATE/ZERO_EXTEND/AND nodes due to legalization.
2748 if (V.getOpcode() == ISD::TRUNCATE || V.getOpcode() == ISD::ZERO_EXTEND) {
2749 V = V.getOperand(0);
// An AND with 1 explicitly masks the value to a single bit.
2753 if (V.getOpcode() == ISD::AND && isOneConstant(V.getOperand(1))) {
2755 V = V.getOperand(0);
2762 // If this is not a carry, return.
2763 if (V.getResNo() != 1)
2766 if (V.getOpcode() != ISD::ADDCARRY && V.getOpcode() != ISD::SUBCARRY &&
2767 V.getOpcode() != ISD::UADDO && V.getOpcode() != ISD::USUBO)
// Only use the carry when the producing operation is legal or custom for
// its value type.
2770 EVT VT = V->getValueType(0);
2771 if (!TLI.isOperationLegalOrCustom(V.getOpcode(), VT))
2774 // If the result is masked, then no matter what kind of bool it is we can
2775 // return. If it isn't, then we need to make sure the bool type is either 0 or
2776 // 1 and not other values.
2778 TLI.getBooleanContents(V.getValueType()) ==
2779 TargetLoweringBase::ZeroOrOneBooleanContent)
2785 /// Given the operands of an add/sub operation, see if the 2nd operand is a
2786 /// masked 0/1 whose source operand is actually known to be 0/-1. If so, invert
2787 /// the opcode and bypass the mask operation.
2788 static SDValue foldAddSubMasked1(bool IsAdd, SDValue N0, SDValue N1,
2789 SelectionDAG &DAG, const SDLoc &DL) {
2790 if (N1.getOpcode() != ISD::AND || !isOneOrOneSplat(N1->getOperand(1)))
// All sign bits set means the unmasked source is known to be 0 or -1,
// so (X & 1) equals -(-X): add becomes sub of the source and vice versa.
2793 EVT VT = N0.getValueType();
2794 if (DAG.ComputeNumSignBits(N1.getOperand(0)) != VT.getScalarSizeInBits())
2797 // add N0, (and (AssertSext X, i1), 1) --> sub N0, X
2798 // sub N0, (and (AssertSext X, i1), 1) --> add N0, X
2799 return DAG.getNode(IsAdd ? ISD::SUB : ISD::ADD, DL, VT, N0, N1.getOperand(0));
2802 /// Helper for doing combines based on N0 and N1 being added to each other.
2803 SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
2804 SDNode *LocReference) {
2805 EVT VT = N0.getValueType();
2806 SDLoc DL(LocReference);
2808 // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
2809 if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
2810 isNullOrNullSplat(N1.getOperand(0).getOperand(0)))
2811 return DAG.getNode(ISD::SUB, DL, VT, N0,
2812 DAG.getNode(ISD::SHL, DL, VT,
2813 N1.getOperand(0).getOperand(1),
2816 if (SDValue V = foldAddSubMasked1(true, N0, N1, DAG, DL))
2820 // add (add x, 1), y
2821 // And if the target does not like this form then turn into:
2822 // sub y, (xor x, -1)
2823 if (!TLI.preferIncOfAddToSubOfNot(VT) && N0.getOpcode() == ISD::ADD &&
2824 N0.hasOneUse() && isOneOrOneSplat(N0.getOperand(1))) {
2825 SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
2826 DAG.getAllOnesConstant(DL, VT));
2827 return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
2830 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse()) {
2831 // Hoist one-use subtraction by non-opaque constant:
2832 // (x - C) + y -> (x + y) - C
2833 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
2834 if (isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
2835 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), N1);
2836 return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
2838 // Hoist one-use subtraction from non-opaque constant:
2839 // (C - x) + y -> (y - x) + C
2840 if (isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
2841 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N1, N0.getOperand(1));
2842 return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(0));
2846 // If the target's bool is represented as 0/1, prefer to make this 'sub 0/1'
2847 // rather than 'add 0/-1' (the zext should get folded).
2848 // add (sext i1 Y), X --> sub X, (zext i1 Y)
2849 if (N0.getOpcode() == ISD::SIGN_EXTEND &&
2850 N0.getOperand(0).getScalarValueSizeInBits() == 1 &&
2851 TLI.getBooleanContents(VT) == TargetLowering::ZeroOrOneBooleanContent) {
2852 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
2853 return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
2856 // add X, (sextinreg Y i1) -> sub X, (and Y 1)
2857 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
2858 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
2859 if (TN->getVT() == MVT::i1) {
2860 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
2861 DAG.getConstant(1, DL, VT));
2862 return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
2866 // (add X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
2867 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1)) &&
2869 return DAG.getNode(ISD::ADDCARRY, DL, N1->getVTList(),
2870 N0, N1.getOperand(0), N1.getOperand(2));
2872 // (add X, Carry) -> (addcarry X, 0, Carry)
2873 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
2874 if (SDValue Carry = getAsCarry(TLI, N1))
2875 return DAG.getNode(ISD::ADDCARRY, DL,
2876 DAG.getVTList(VT, Carry.getValueType()), N0,
2877 DAG.getConstant(0, DL, VT), Carry);
2882 SDValue DAGCombiner::visitADDC(SDNode *N) {
2883 SDValue N0 = N->getOperand(0);
2884 SDValue N1 = N->getOperand(1);
2885 EVT VT = N0.getValueType();
2888 // If the flag result is dead, turn this into an ADD.
2889 if (!N->hasAnyUseOfValue(1))
2890 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2891 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
2893 // canonicalize constant to RHS.
2894 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
2895 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
2897 return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
2899 // fold (addc x, 0) -> x + no carry out
2900 if (isNullConstant(N1))
2901 return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
2904 // If it cannot overflow, transform into an add.
2905 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2906 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2907 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
2913 * Flips a boolean if it is cheaper to compute. If the Force parameters is set,
2914 * then the flip also occurs if computing the inverse is the same cost.
2915 * This function returns an empty SDValue in case it cannot flip the boolean
2916 * without increasing the cost of the computation. If you want to flip a boolean
2917 * no matter what, use DAG.getLogicalNOT.
2919 static SDValue extractBooleanFlip(SDValue V, SelectionDAG &DAG,
2920 const TargetLowering &TLI,
2922 if (Force && isa<ConstantSDNode>(V))
2923 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
2925 if (V.getOpcode() != ISD::XOR)
2928 ConstantSDNode *Const = isConstOrConstSplat(V.getOperand(1), false);
2932 EVT VT = V.getValueType();
2934 bool IsFlip = false;
2935 switch(TLI.getBooleanContents(VT)) {
2936 case TargetLowering::ZeroOrOneBooleanContent:
2937 IsFlip = Const->isOne();
2939 case TargetLowering::ZeroOrNegativeOneBooleanContent:
2940 IsFlip = Const->isAllOnes();
2942 case TargetLowering::UndefinedBooleanContent:
2943 IsFlip = (Const->getAPIntValue() & 0x01) == 1;
2948 return V.getOperand(0);
2950 return DAG.getLogicalNOT(SDLoc(V), V, V.getValueType());
2954 SDValue DAGCombiner::visitADDO(SDNode *N) {
2955 SDValue N0 = N->getOperand(0);
2956 SDValue N1 = N->getOperand(1);
2957 EVT VT = N0.getValueType();
2958 bool IsSigned = (ISD::SADDO == N->getOpcode());
2960 EVT CarryVT = N->getValueType(1);
2963 // If the flag result is dead, turn this into an ADD.
2964 if (!N->hasAnyUseOfValue(1))
2965 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2966 DAG.getUNDEF(CarryVT));
2968 // canonicalize constant to RHS.
2969 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
2970 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
2971 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
2973 // fold (addo x, 0) -> x + no carry out
2974 if (isNullOrNullSplat(N1))
2975 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
2978 // If it cannot overflow, transform into an add.
2979 if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
2980 return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
2981 DAG.getConstant(0, DL, CarryVT));
2983 // fold (uaddo (xor a, -1), 1) -> (usub 0, a) and flip carry.
2984 if (isBitwiseNot(N0) && isOneOrOneSplat(N1)) {
2985 SDValue Sub = DAG.getNode(ISD::USUBO, DL, N->getVTList(),
2986 DAG.getConstant(0, DL, VT), N0.getOperand(0));
2988 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
2991 if (SDValue Combined = visitUADDOLike(N0, N1, N))
2994 if (SDValue Combined = visitUADDOLike(N1, N0, N))
3001 SDValue DAGCombiner::visitUADDOLike(SDValue N0, SDValue N1, SDNode *N) {
3002 EVT VT = N0.getValueType();
3006 // (uaddo X, (addcarry Y, 0, Carry)) -> (addcarry X, Y, Carry)
3007 // If Y + 1 cannot overflow.
3008 if (N1.getOpcode() == ISD::ADDCARRY && isNullConstant(N1.getOperand(1))) {
3009 SDValue Y = N1.getOperand(0);
3010 SDValue One = DAG.getConstant(1, SDLoc(N), Y.getValueType());
3011 if (DAG.computeOverflowKind(Y, One) == SelectionDAG::OFK_Never)
3012 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0, Y,
3016 // (uaddo X, Carry) -> (addcarry X, 0, Carry)
3017 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT))
3018 if (SDValue Carry = getAsCarry(TLI, N1))
3019 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), N0,
3020 DAG.getConstant(0, SDLoc(N), VT), Carry);
3025 SDValue DAGCombiner::visitADDE(SDNode *N) {
3026 SDValue N0 = N->getOperand(0);
3027 SDValue N1 = N->getOperand(1);
3028 SDValue CarryIn = N->getOperand(2);
3030 // canonicalize constant to RHS
3031 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3032 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3034 return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
3037 // fold (adde x, y, false) -> (addc x, y)
3038 if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
3039 return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
3044 SDValue DAGCombiner::visitADDCARRY(SDNode *N) {
3045 SDValue N0 = N->getOperand(0);
3046 SDValue N1 = N->getOperand(1);
3047 SDValue CarryIn = N->getOperand(2);
3050 // canonicalize constant to RHS
3051 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3052 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3054 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn);
3056 // fold (addcarry x, y, false) -> (uaddo x, y)
3057 if (isNullConstant(CarryIn)) {
3058 if (!LegalOperations ||
3059 TLI.isOperationLegalOrCustom(ISD::UADDO, N->getValueType(0)))
3060 return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1);
3063 // fold (addcarry 0, 0, X) -> (and (ext/trunc X), 1) and no carry.
3064 if (isNullConstant(N0) && isNullConstant(N1)) {
3065 EVT VT = N0.getValueType();
3066 EVT CarryVT = CarryIn.getValueType();
3067 SDValue CarryExt = DAG.getBoolExtOrTrunc(CarryIn, DL, VT, CarryVT);
3068 AddToWorklist(CarryExt.getNode());
3069 return CombineTo(N, DAG.getNode(ISD::AND, DL, VT, CarryExt,
3070 DAG.getConstant(1, DL, VT)),
3071 DAG.getConstant(0, DL, CarryVT));
3074 if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N))
3077 if (SDValue Combined = visitADDCARRYLike(N1, N0, CarryIn, N))
3083 SDValue DAGCombiner::visitSADDO_CARRY(SDNode *N) {
3084 SDValue N0 = N->getOperand(0);
3085 SDValue N1 = N->getOperand(1);
3086 SDValue CarryIn = N->getOperand(2);
3089 // canonicalize constant to RHS
3090 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
3091 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
3093 return DAG.getNode(ISD::SADDO_CARRY, DL, N->getVTList(), N1, N0, CarryIn);
3095 // fold (saddo_carry x, y, false) -> (saddo x, y)
3096 if (isNullConstant(CarryIn)) {
3097 if (!LegalOperations ||
3098 TLI.isOperationLegalOrCustom(ISD::SADDO, N->getValueType(0)))
3099 return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0, N1);
3106 * If we are facing some sort of diamond carry propapagtion pattern try to
3107 * break it up to generate something like:
3108 * (addcarry X, 0, (addcarry A, B, Z):Carry)
3110 * The end result is usually an increase in operation required, but because the
3111 * carry is now linearized, other tranforms can kick in and optimize the DAG.
3113 * Patterns typically look something like
3118 * | (addcarry *, 0, Z)
3122 * (addcarry X, *, *)
3124 * But numerous variation exist. Our goal is to identify A, B, X and Z and
3125 * produce a combine with a single path for carry propagation.
3127 static SDValue combineADDCARRYDiamond(DAGCombiner &Combiner, SelectionDAG &DAG,
3128 SDValue X, SDValue Carry0, SDValue Carry1,
3130 if (Carry1.getResNo() != 1 || Carry0.getResNo() != 1)
3132 if (Carry1.getOpcode() != ISD::UADDO)
3138 * First look for a suitable Z. It will present itself in the form of
3139 * (addcarry Y, 0, Z) or its equivalent (uaddo Y, 1) for Z=true
3141 if (Carry0.getOpcode() == ISD::ADDCARRY &&
3142 isNullConstant(Carry0.getOperand(1))) {
3143 Z = Carry0.getOperand(2);
3144 } else if (Carry0.getOpcode() == ISD::UADDO &&
3145 isOneConstant(Carry0.getOperand(1))) {
3146 EVT VT = Combiner.getSetCCResultType(Carry0.getValueType());
3147 Z = DAG.getConstant(1, SDLoc(Carry0.getOperand(1)), VT);
3149 // We couldn't find a suitable Z.
3154 auto cancelDiamond = [&](SDValue A,SDValue B) {
3156 SDValue NewY = DAG.getNode(ISD::ADDCARRY, DL, Carry0->getVTList(), A, B, Z);
3157 Combiner.AddToWorklist(NewY.getNode());
3158 return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), X,
3159 DAG.getConstant(0, DL, X.getValueType()),
3168 * (addcarry *, 0, Z)
3170 if (Carry0.getOperand(0) == Carry1.getValue(0)) {
3171 return cancelDiamond(Carry1.getOperand(0), Carry1.getOperand(1));
3175 * (addcarry A, 0, Z)
3181 if (Carry1.getOperand(0) == Carry0.getValue(0)) {
3182 return cancelDiamond(Carry0.getOperand(0), Carry1.getOperand(1));
3185 if (Carry1.getOperand(1) == Carry0.getValue(0)) {
3186 return cancelDiamond(Carry1.getOperand(0), Carry0.getOperand(0));
3192 // If we are facing some sort of diamond carry/borrow in/out pattern try to
3193 // match patterns like:
3195 // (uaddo A, B) CarryIn
3198 // PartialSum PartialCarryOutX /
3200 // | ____|____________/
3202 // (uaddo *, *) \________
3205 // | PartialCarryOutY |
3208 // AddCarrySum | ______/
3210 // CarryOut = (or *, *)
3212 // And generate ADDCARRY (or SUBCARRY) with two result values:
3214 // {AddCarrySum, CarryOut} = (addcarry A, B, CarryIn)
3216 // Our goal is to identify A, B, and CarryIn and produce ADDCARRY/SUBCARRY with
3217 // a single path for carry/borrow out propagation:
3218 static SDValue combineCarryDiamond(SelectionDAG &DAG, const TargetLowering &TLI,
3219 SDValue N0, SDValue N1, SDNode *N) {
3220 SDValue Carry0 = getAsCarry(TLI, N0);
3223 SDValue Carry1 = getAsCarry(TLI, N1);
3227 unsigned Opcode = Carry0.getOpcode();
3228 if (Opcode != Carry1.getOpcode())
3230 if (Opcode != ISD::UADDO && Opcode != ISD::USUBO)
3233 // Canonicalize the add/sub of A and B (the top node in the above ASCII art)
3234 // as Carry0 and the add/sub of the carry in as Carry1 (the middle node).
3235 if (Carry1.getNode()->isOperandOf(Carry0.getNode()))
3236 std::swap(Carry0, Carry1);
3238 // Check if nodes are connected in expected way.
3239 if (Carry1.getOperand(0) != Carry0.getValue(0) &&
3240 Carry1.getOperand(1) != Carry0.getValue(0))
3243 // The carry in value must be on the righthand side for subtraction.
3244 unsigned CarryInOperandNum =
3245 Carry1.getOperand(0) == Carry0.getValue(0) ? 1 : 0;
3246 if (Opcode == ISD::USUBO && CarryInOperandNum != 1)
3248 SDValue CarryIn = Carry1.getOperand(CarryInOperandNum);
3250 unsigned NewOp = Opcode == ISD::UADDO ? ISD::ADDCARRY : ISD::SUBCARRY;
3251 if (!TLI.isOperationLegalOrCustom(NewOp, Carry0.getValue(0).getValueType()))
3254 // Verify that the carry/borrow in is plausibly a carry/borrow bit.
3255 // TODO: make getAsCarry() aware of how partial carries are merged.
3256 if (CarryIn.getOpcode() != ISD::ZERO_EXTEND)
3258 CarryIn = CarryIn.getOperand(0);
3259 if (CarryIn.getValueType() != MVT::i1)
3264 DAG.getNode(NewOp, DL, Carry1->getVTList(), Carry0.getOperand(0),
3265 Carry0.getOperand(1), CarryIn);
3267 // Please note that because we have proven that the result of the UADDO/USUBO
3268 // of A and B feeds into the UADDO/USUBO that does the carry/borrow in, we can
3269 // therefore prove that if the first UADDO/USUBO overflows, the second
3270 // UADDO/USUBO cannot. For example consider 8-bit numbers where 0xFF is the
3273 // 0xFF + 0xFF == 0xFE with carry but 0xFE + 1 does not carry
3274 // 0x00 - 0xFF == 1 with a carry/borrow but 1 - 1 == 0 (no carry/borrow)
3276 // This is important because it means that OR and XOR can be used to merge
3277 // carry flags; and that AND can return a constant zero.
3279 // TODO: match other operations that can merge flags (ADD, etc)
3280 DAG.ReplaceAllUsesOfValueWith(Carry1.getValue(0), Merged.getValue(0));
3281 if (N->getOpcode() == ISD::AND)
3282 return DAG.getConstant(0, DL, MVT::i1);
3283 return Merged.getValue(1);
3286 SDValue DAGCombiner::visitADDCARRYLike(SDValue N0, SDValue N1, SDValue CarryIn,
3288 // fold (addcarry (xor a, -1), b, c) -> (subcarry b, a, !c) and flip carry.
3289 if (isBitwiseNot(N0))
3290 if (SDValue NotC = extractBooleanFlip(CarryIn, DAG, TLI, true)) {
3292 SDValue Sub = DAG.getNode(ISD::SUBCARRY, DL, N->getVTList(), N1,
3293 N0.getOperand(0), NotC);
3295 N, Sub, DAG.getLogicalNOT(DL, Sub.getValue(1), Sub->getValueType(1)));
3298 // Iff the flag result is dead:
3299 // (addcarry (add|uaddo X, Y), 0, Carry) -> (addcarry X, Y, Carry)
3300 // Don't do this if the Carry comes from the uaddo. It won't remove the uaddo
3301 // or the dependency between the instructions.
3302 if ((N0.getOpcode() == ISD::ADD ||
3303 (N0.getOpcode() == ISD::UADDO && N0.getResNo() == 0 &&
3304 N0.getValue(1) != CarryIn)) &&
3305 isNullConstant(N1) && !N->hasAnyUseOfValue(1))
3306 return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(),
3307 N0.getOperand(0), N0.getOperand(1), CarryIn);
3310 * When one of the addcarry argument is itself a carry, we may be facing
3311 * a diamond carry propagation. In which case we try to transform the DAG
3312 * to ensure linear carry propagation if that is possible.
3314 if (auto Y = getAsCarry(TLI, N1)) {
3315 // Because both are carries, Y and Z can be swapped.
3316 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, Y, CarryIn, N))
3318 if (auto R = combineADDCARRYDiamond(*this, DAG, N0, CarryIn, Y, N))
3325 // Attempt to create a USUBSAT(LHS, RHS) node with DstVT, performing a
3326 // clamp/truncation if necessary.
3327 static SDValue getTruncatedUSUBSAT(EVT DstVT, EVT SrcVT, SDValue LHS,
3328 SDValue RHS, SelectionDAG &DAG,
3330 assert(DstVT.getScalarSizeInBits() <= SrcVT.getScalarSizeInBits() &&
3331 "Illegal truncation");
3334 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
3336 // If the LHS is zero-extended then we can perform the USUBSAT as DstVT by
3338 APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(),
3339 DstVT.getScalarSizeInBits());
3340 if (!DAG.MaskedValueIsZero(LHS, UpperBits))
3344 DAG.getConstant(APInt::getLowBitsSet(SrcVT.getScalarSizeInBits(),
3345 DstVT.getScalarSizeInBits()),
3347 RHS = DAG.getNode(ISD::UMIN, DL, SrcVT, RHS, SatLimit);
3348 RHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, RHS);
3349 LHS = DAG.getNode(ISD::TRUNCATE, DL, DstVT, LHS);
3350 return DAG.getNode(ISD::USUBSAT, DL, DstVT, LHS, RHS);
3353 // Try to find umax(a,b) - b or a - umin(a,b) patterns that may be converted to
3354 // usubsat(a,b), optionally as a truncated type.
3355 SDValue DAGCombiner::foldSubToUSubSat(EVT DstVT, SDNode *N) {
3356 if (N->getOpcode() != ISD::SUB ||
3357 !(!LegalOperations || hasOperation(ISD::USUBSAT, DstVT)))
3360 EVT SubVT = N->getValueType(0);
3361 SDValue Op0 = N->getOperand(0);
3362 SDValue Op1 = N->getOperand(1);
3364 // Try to find umax(a,b) - b or a - umin(a,b) patterns
3365 // they may be converted to usubsat(a,b).
3366 if (Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
3367 SDValue MaxLHS = Op0.getOperand(0);
3368 SDValue MaxRHS = Op0.getOperand(1);
3370 return getTruncatedUSUBSAT(DstVT, SubVT, MaxRHS, Op1, DAG, SDLoc(N));
3372 return getTruncatedUSUBSAT(DstVT, SubVT, MaxLHS, Op1, DAG, SDLoc(N));
3375 if (Op1.getOpcode() == ISD::UMIN && Op1.hasOneUse()) {
3376 SDValue MinLHS = Op1.getOperand(0);
3377 SDValue MinRHS = Op1.getOperand(1);
3379 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinRHS, DAG, SDLoc(N));
3381 return getTruncatedUSUBSAT(DstVT, SubVT, Op0, MinLHS, DAG, SDLoc(N));
3384 // sub(a,trunc(umin(zext(a),b))) -> usubsat(a,trunc(umin(b,SatLimit)))
3385 if (Op1.getOpcode() == ISD::TRUNCATE &&
3386 Op1.getOperand(0).getOpcode() == ISD::UMIN &&
3387 Op1.getOperand(0).hasOneUse()) {
3388 SDValue MinLHS = Op1.getOperand(0).getOperand(0);
3389 SDValue MinRHS = Op1.getOperand(0).getOperand(1);
3390 if (MinLHS.getOpcode() == ISD::ZERO_EXTEND && MinLHS.getOperand(0) == Op0)
3391 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinLHS, MinRHS,
3393 if (MinRHS.getOpcode() == ISD::ZERO_EXTEND && MinRHS.getOperand(0) == Op0)
3394 return getTruncatedUSUBSAT(DstVT, MinLHS.getValueType(), MinRHS, MinLHS,
3401 // Since it may not be valid to emit a fold to zero for vector initializers
3402 // check if we can before folding.
3403 static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
3404 SelectionDAG &DAG, bool LegalOperations) {
3406 return DAG.getConstant(0, DL, VT);
3407 if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
3408 return DAG.getConstant(0, DL, VT);
3412 SDValue DAGCombiner::visitSUB(SDNode *N) {
3413 SDValue N0 = N->getOperand(0);
3414 SDValue N1 = N->getOperand(1);
3415 EVT VT = N0.getValueType();
3418 auto PeekThroughFreeze = [](SDValue N) {
3419 if (N->getOpcode() == ISD::FREEZE && N.hasOneUse())
3420 return N->getOperand(0);
3424 // fold (sub x, x) -> 0
3425 // FIXME: Refactor this and xor and other similar operations together.
3426 if (PeekThroughFreeze(N0) == PeekThroughFreeze(N1))
3427 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
3429 // fold (sub c1, c2) -> c3
3430 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
3434 if (VT.isVector()) {
3435 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
3438 // fold (sub x, 0) -> x, vector edition
3439 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
3443 if (SDValue NewSel = foldBinOpIntoSelect(N))
3446 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
3448 // fold (sub x, c) -> (add x, -c)
3450 return DAG.getNode(ISD::ADD, DL, VT, N0,
3451 DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
3454 if (isNullOrNullSplat(N0)) {
3455 unsigned BitWidth = VT.getScalarSizeInBits();
3456 // Right-shifting everything out but the sign bit followed by negation is
3457 // the same as flipping arithmetic/logical shift type without the negation:
3458 // -(X >>u 31) -> (X >>s 31)
3459 // -(X >>s 31) -> (X >>u 31)
3460 if (N1->getOpcode() == ISD::SRA || N1->getOpcode() == ISD::SRL) {
3461 ConstantSDNode *ShiftAmt = isConstOrConstSplat(N1.getOperand(1));
3462 if (ShiftAmt && ShiftAmt->getAPIntValue() == (BitWidth - 1)) {
3463 auto NewSh = N1->getOpcode() == ISD::SRA ? ISD::SRL : ISD::SRA;
3464 if (!LegalOperations || TLI.isOperationLegal(NewSh, VT))
3465 return DAG.getNode(NewSh, DL, VT, N1.getOperand(0), N1.getOperand(1));
3469 // 0 - X --> 0 if the sub is NUW.
3470 if (N->getFlags().hasNoUnsignedWrap())
3473 if (DAG.MaskedValueIsZero(N1, ~APInt::getSignMask(BitWidth))) {
3474 // N1 is either 0 or the minimum signed value. If the sub is NSW, then
3475 // N1 must be 0 because negating the minimum signed value is undefined.
3476 if (N->getFlags().hasNoSignedWrap())
3479 // 0 - X --> X if X is 0 or the minimum signed value.
3483 // Convert 0 - abs(x).
3484 if (N1.getOpcode() == ISD::ABS && N1.hasOneUse() &&
3485 !TLI.isOperationLegalOrCustom(ISD::ABS, VT))
3486 if (SDValue Result = TLI.expandABS(N1.getNode(), DAG, true))
3489 // Fold neg(splat(neg(x)) -> splat(x)
3490 if (VT.isVector()) {
3491 SDValue N1S = DAG.getSplatValue(N1, true);
3492 if (N1S && N1S.getOpcode() == ISD::SUB &&
3493 isNullConstant(N1S.getOperand(0))) {
3494 if (VT.isScalableVector())
3495 return DAG.getSplatVector(VT, DL, N1S.getOperand(1));
3496 return DAG.getSplatBuildVector(VT, DL, N1S.getOperand(1));
3501 // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
3502 if (isAllOnesOrAllOnesSplat(N0))
3503 return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
3505 // fold (A - (0-B)) -> A+B
3506 if (N1.getOpcode() == ISD::SUB && isNullOrNullSplat(N1.getOperand(0)))
3507 return DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(1));
3509 // fold A-(A-B) -> B
3510 if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
3511 return N1.getOperand(1);
3513 // fold (A+B)-A -> B
3514 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
3515 return N0.getOperand(1);
3517 // fold (A+B)-B -> A
3518 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
3519 return N0.getOperand(0);
3521 // fold (A+C1)-C2 -> A+(C1-C2)
3522 if (N0.getOpcode() == ISD::ADD) {
3523 SDValue N01 = N0.getOperand(1);
3524 if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N01, N1}))
3525 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
3528 // fold C2-(A+C1) -> (C2-C1)-A
3529 if (N1.getOpcode() == ISD::ADD) {
3530 SDValue N11 = N1.getOperand(1);
3531 if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11}))
3532 return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
3535 // fold (A-C1)-C2 -> A-(C1+C2)
3536 if (N0.getOpcode() == ISD::SUB) {
3537 SDValue N01 = N0.getOperand(1);
3538 if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N01, N1}))
3539 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
3542 // fold (c1-A)-c2 -> (c1-c2)-A
3543 if (N0.getOpcode() == ISD::SUB) {
3544 SDValue N00 = N0.getOperand(0);
3545 if (SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N00, N1}))
3546 return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
3549 // fold ((A+(B+or-C))-B) -> A+or-C
3550 if (N0.getOpcode() == ISD::ADD &&
3551 (N0.getOperand(1).getOpcode() == ISD::SUB ||
3552 N0.getOperand(1).getOpcode() == ISD::ADD) &&
3553 N0.getOperand(1).getOperand(0) == N1)
3554 return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
3555 N0.getOperand(1).getOperand(1));
3557 // fold ((A+(C+B))-B) -> A+C
3558 if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
3559 N0.getOperand(1).getOperand(1) == N1)
3560 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
3561 N0.getOperand(1).getOperand(0));
3563 // fold ((A-(B-C))-C) -> A-B
3564 if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
3565 N0.getOperand(1).getOperand(1) == N1)
3566 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
3567 N0.getOperand(1).getOperand(0));
3569 // fold (A-(B-C)) -> A+(C-B)
3570 if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
3571 return DAG.getNode(ISD::ADD, DL, VT, N0,
3572 DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
3575 // A - (A & B) -> A & (~B)
3576 if (N1.getOpcode() == ISD::AND) {
3577 SDValue A = N1.getOperand(0);
3578 SDValue B = N1.getOperand(1);
3582 (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
3584 DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
3585 return DAG.getNode(ISD::AND, DL, VT, A, InvB);
3589 // fold (X - (-Y * Z)) -> (X + (Y * Z))
3590 if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
3591 if (N1.getOperand(0).getOpcode() == ISD::SUB &&
3592 isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
3593 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
3594 N1.getOperand(0).getOperand(1),
3596 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
3598 if (N1.getOperand(1).getOpcode() == ISD::SUB &&
3599 isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
3600 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
3602 N1.getOperand(1).getOperand(1));
3603 return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
3607 // If either operand of a sub is undef, the result is undef
3613 if (SDValue V = foldAddSubBoolOfMaskedVal(N, DAG))
3616 if (SDValue V = foldAddSubOfSignBit(N, DAG))
3619 if (SDValue V = foldAddSubMasked1(false, N0, N1, DAG, SDLoc(N)))
3622 if (SDValue V = foldSubToUSubSat(VT, N))
3625 // (x - y) - 1 -> add (xor y, -1), x
3626 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isOneOrOneSplat(N1)) {
3627 SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
3628 DAG.getAllOnesConstant(DL, VT));
3629 return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
3633 // sub y, (xor x, -1)
3634 // And if the target does not like this form then turn into:
3635 // add (add x, y), 1
3636 if (TLI.preferIncOfAddToSubOfNot(VT) && N1.hasOneUse() && isBitwiseNot(N1)) {
3637 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, N1.getOperand(0));
3638 return DAG.getNode(ISD::ADD, DL, VT, Add, DAG.getConstant(1, DL, VT));
3641 // Hoist one-use addition by non-opaque constant:
3642 // (x + C) - y -> (x - y) + C
3643 if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
3644 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
3645 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
3646 return DAG.getNode(ISD::ADD, DL, VT, Sub, N0.getOperand(1));
3648 // y - (x + C) -> (y - x) - C
3649 if (N1.getOpcode() == ISD::ADD && N1.hasOneUse() &&
3650 isConstantOrConstantVector(N1.getOperand(1), /*NoOpaques=*/true)) {
3651 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, N1.getOperand(0));
3652 return DAG.getNode(ISD::SUB, DL, VT, Sub, N1.getOperand(1));
3654 // (x - C) - y -> (x - y) - C
3655 // This is necessary because SUB(X,C) -> ADD(X,-C) doesn't work for vectors.
3656 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
3657 isConstantOrConstantVector(N0.getOperand(1), /*NoOpaques=*/true)) {
3658 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), N1);
3659 return DAG.getNode(ISD::SUB, DL, VT, Sub, N0.getOperand(1));
3661 // (C - x) - y -> C - (x + y)
3662 if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
3663 isConstantOrConstantVector(N0.getOperand(0), /*NoOpaques=*/true)) {
3664 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1), N1);
3665 return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), Add);
3668 // If the target's bool is represented as 0/-1, prefer to make this 'add 0/-1'
3669 // rather than 'sub 0/1' (the sext should get folded).
3670 // sub X, (zext i1 Y) --> add X, (sext i1 Y)
3671 if (N1.getOpcode() == ISD::ZERO_EXTEND &&
3672 N1.getOperand(0).getScalarValueSizeInBits() == 1 &&
3673 TLI.getBooleanContents(VT) ==
3674 TargetLowering::ZeroOrNegativeOneBooleanContent) {
3675 SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N1.getOperand(0));
3676 return DAG.getNode(ISD::ADD, DL, VT, N0, SExt);
3679 // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X)
3680 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
3681 if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) {
3682 SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1);
3683 SDValue S0 = N1.getOperand(0);
3684 if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0))
3685 if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
3686 if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
3687 return DAG.getNode(ISD::ABS, SDLoc(N), VT, S0);
3691 // If the relocation model supports it, consider symbol offsets.
3692 if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
3693 if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
3694 // fold (sub Sym, c) -> Sym-c
3695 if (N1C && GA->getOpcode() == ISD::GlobalAddress)
3696 return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
3698 (uint64_t)N1C->getSExtValue());
3699 // fold (sub Sym+c1, Sym+c2) -> c1-c2
3700 if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
3701 if (GA->getGlobal() == GB->getGlobal())
3702 return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
3706 // sub X, (sextinreg Y i1) -> add X, (and Y 1)
3707 if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
3708 VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
3709 if (TN->getVT() == MVT::i1) {
3710 SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
3711 DAG.getConstant(1, DL, VT));
3712 return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
3716 // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
3717 if (N1.getOpcode() == ISD::VSCALE) {
3718 const APInt &IntVal = N1.getConstantOperandAPInt(0);
3719 return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
3722 // canonicalize (sub X, step_vector(C)) to (add X, step_vector(-C))
3723 if (N1.getOpcode() == ISD::STEP_VECTOR && N1.hasOneUse()) {
3724 APInt NewStep = -N1.getConstantOperandAPInt(0);
3725 return DAG.getNode(ISD::ADD, DL, VT, N0,
3726 DAG.getStepVector(DL, VT, NewStep));
3729 // Prefer an add for more folding potential and possibly better codegen:
3730 // sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
3731 if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
3732 SDValue ShAmt = N1.getOperand(1);
3733 ConstantSDNode *ShAmtC = isConstOrConstSplat(ShAmt);
3735 ShAmtC->getAPIntValue() == (N1.getScalarValueSizeInBits() - 1)) {
3736 SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, N1.getOperand(0), ShAmt);
3737 return DAG.getNode(ISD::ADD, DL, VT, N0, SRA);
3741 // As with the previous fold, prefer add for more folding potential.
3742 // Subtracting SMIN/0 is the same as adding SMIN/0:
3743 // N0 - (X << BW-1) --> N0 + (X << BW-1)
3744 if (N1.getOpcode() == ISD::SHL) {
3745 ConstantSDNode *ShlC = isConstOrConstSplat(N1.getOperand(1));
3746 if (ShlC && ShlC->getAPIntValue() == VT.getScalarSizeInBits() - 1)
3747 return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
3750 if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
3751 // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
3752 if (SDValue Carry = getAsCarry(TLI, N0)) {
3754 SDValue Zero = DAG.getConstant(0, DL, VT);
3755 SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
3756 return DAG.getNode(ISD::ADDCARRY, DL,
3757 DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
3762 // If there's no chance of borrowing from adjacent bits, then sub is xor:
3763 // sub C0, X --> xor X, C0
3764 if (ConstantSDNode *C0 = isConstOrConstSplat(N0)) {
3765 if (!C0->isOpaque()) {
3766 const APInt &C0Val = C0->getAPIntValue();
3767 const APInt &MaybeOnes = ~DAG.computeKnownBits(N1).Zero;
3768 if ((C0Val - MaybeOnes) == (C0Val ^ MaybeOnes))
3769 return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
/// Combine a saturating-subtract node (handles both signed and unsigned
/// variants via N->getOpcode()): constant folding plus trivial identities.
/// NOTE(review): some interior lines (early returns/braces) are elided in
/// this view of the file.
3776 SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
3777 SDValue N0 = N->getOperand(0);
3778 SDValue N1 = N->getOperand(1);
3779 EVT VT = N0.getValueType();
// An undef operand may take any value, and 0 is always a legal result.
3782 // fold (sub_sat x, undef) -> 0
3783 if (N0.isUndef() || N1.isUndef())
3784 return DAG.getConstant(0, DL, VT);
3786 // fold (sub_sat x, x) -> 0
3788 return DAG.getConstant(0, DL, VT);
3790 // fold (sub_sat c1, c2) -> c3
3791 if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
3795 if (VT.isVector()) {
3796 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
3799 // fold (sub_sat x, 0) -> x, vector edition
3800 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
3804 // fold (sub_sat x, 0) -> x
3805 if (isNullConstant(N1))
/// Combine SUBC: a subtract that also produces a borrow flag as a glued
/// second result (MVT::Glue). All folds below re-emit CARRY_FALSE for the
/// flag since none of them can borrow.
3811 SDValue DAGCombiner::visitSUBC(SDNode *N) {
3812 SDValue N0 = N->getOperand(0);
3813 SDValue N1 = N->getOperand(1);
3814 EVT VT = N0.getValueType();
3817 // If the flag result is dead, turn this into an SUB.
3818 if (!N->hasAnyUseOfValue(1))
3819 return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
3820 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
3822 // fold (subc x, x) -> 0 + no borrow
3824 return CombineTo(N, DAG.getConstant(0, DL, VT),
3825 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
3827 // fold (subc x, 0) -> x + no borrow
3828 if (isNullConstant(N1))
3829 return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
// Subtracting anything from all-ones never borrows, and -1 - x == ~x.
3831 // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
3832 if (isAllOnesConstant(N0))
3833 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
3834 DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
/// Combine SSUBO/USUBO: subtract with an overflow flag as the second result.
3839 SDValue DAGCombiner::visitSUBO(SDNode *N) {
3840 SDValue N0 = N->getOperand(0);
3841 SDValue N1 = N->getOperand(1);
3842 EVT VT = N0.getValueType();
3843 bool IsSigned = (ISD::SSUBO == N->getOpcode());
3845 EVT CarryVT = N->getValueType(1);
3848 // If the flag result is dead, turn this into an SUB.
3849 if (!N->hasAnyUseOfValue(1))
3850 return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
3851 DAG.getUNDEF(CarryVT))
3853 // fold (subo x, x) -> 0 + no borrow
3855 return CombineTo(N, DAG.getConstant(0, DL, VT),
3856 DAG.getConstant(0, DL, CarryVT));
3858 ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
3860 // fold (subo x, c) -> (addo x, -c)
// Skip INT_MIN: its negation is not representable, so the addo form would
// not be equivalent.
3861 if (IsSigned && N1C && !N1C->getAPIntValue().isMinSignedValue()) {
3862 return DAG.getNode(ISD::SADDO, DL, N->getVTList(), N0,
3863 DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
3866 // fold (subo x, 0) -> x + no borrow
3867 if (isNullOrNullSplat(N1))
3868 return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
3870 // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
3871 if (!IsSigned && isAllOnesOrAllOnesSplat(N0))
3872 return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
3873 DAG.getConstant(0, DL, CarryVT));
/// Combine SUBE: subtract with a glued borrow-in (third operand).
3878 SDValue DAGCombiner::visitSUBE(SDNode *N) {
3879 SDValue N0 = N->getOperand(0);
3880 SDValue N1 = N->getOperand(1);
3881 SDValue CarryIn = N->getOperand(2);
// A known-false borrow-in makes the extended subtract a plain SUBC.
3883 // fold (sube x, y, false) -> (subc x, y)
3884 if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
3885 return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);
/// Combine SUBCARRY: subtract with a boolean carry-in operand and a
/// carry-out result.
3890 SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
3891 SDValue N0 = N->getOperand(0);
3892 SDValue N1 = N->getOperand(1);
3893 SDValue CarryIn = N->getOperand(2);
// With a zero carry-in this is exactly an unsigned subtract-with-overflow,
// but only form USUBO if it is legal (or we are pre-legalization).
3895 // fold (subcarry x, y, false) -> (usubo x, y)
3896 if (isNullConstant(CarryIn)) {
3897 if (!LegalOperations ||
3898 TLI.isOperationLegalOrCustom(ISD::USUBO, N->getValueType(0)))
3899 return DAG.getNode(ISD::USUBO, SDLoc(N), N->getVTList(), N0, N1);
/// Combine SSUBO_CARRY: signed subtract with a carry-in and overflow-out.
/// Mirrors visitSUBCARRY for the signed opcode.
3905 SDValue DAGCombiner::visitSSUBO_CARRY(SDNode *N) {
3906 SDValue N0 = N->getOperand(0);
3907 SDValue N1 = N->getOperand(1);
3908 SDValue CarryIn = N->getOperand(2);
3910 // fold (ssubo_carry x, y, false) -> (ssubo x, y)
3911 if (isNullConstant(CarryIn)) {
3912 if (!LegalOperations ||
3913 TLI.isOperationLegalOrCustom(ISD::SSUBO, N->getValueType(0)))
3914 return DAG.getNode(ISD::SSUBO, SDLoc(N), N->getVTList(), N0, N1);
3920 // Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
// The third operand ("Scale") is the fixed-point scale and is carried
// through unchanged by every fold below.
3922 SDValue DAGCombiner::visitMULFIX(SDNode *N) {
3923 SDValue N0 = N->getOperand(0);
3924 SDValue N1 = N->getOperand(1);
3925 SDValue Scale = N->getOperand(2);
3926 EVT VT = N0.getValueType();
3928 // fold (mulfix x, undef, scale) -> 0
3929 if (N0.isUndef() || N1.isUndef())
3930 return DAG.getConstant(0, SDLoc(N), VT);
3932 // Canonicalize constant to RHS (vector doesn't have to splat)
3933 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
3934 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
3935 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);
3937 // fold (mulfix x, 0, scale) -> 0
3938 if (isNullConstant(N1))
3939 return DAG.getConstant(0, SDLoc(N), VT);
/// Combine MUL nodes: constant folding, canonicalization of constants to the
/// RHS, strength reduction of multiplies-by-constant into shifts/adds/subs,
/// vscale/step_vector folds, and replacement of 0/1-factor vectors with a
/// clearing AND mask.
3944 SDValue DAGCombiner::visitMUL(SDNode *N) {
3945 SDValue N0 = N->getOperand(0);
3946 SDValue N1 = N->getOperand(1);
3947 EVT VT = N0.getValueType();
3950 // fold (mul x, undef) -> 0
3951 if (N0.isUndef() || N1.isUndef())
3952 return DAG.getConstant(0, DL, VT);
3954 // fold (mul c1, c2) -> c1*c2
3955 if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, DL, VT, {N0, N1}))
3958 // canonicalize constant to RHS (vector doesn't have to splat)
3959 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
3960 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
3961 return DAG.getNode(ISD::MUL, DL, VT, N1, N0);
3963 bool N1IsConst = false;
3964 bool N1IsOpaqueConst = false;
3968 if (VT.isVector()) {
3969 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
// For vectors, a constant RHS means a splat; record its splat value.
3972 N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
3973 assert((!N1IsConst ||
3974 ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
3975 "Splat APInt should be element width");
3977 N1IsConst = isa<ConstantSDNode>(N1);
3979 ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
3980 N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
3984 // fold (mul x, 0) -> 0
3985 if (N1IsConst && ConstValue1.isZero())
3988 // fold (mul x, 1) -> x
3989 if (N1IsConst && ConstValue1.isOne())
3992 if (SDValue NewSel = foldBinOpIntoSelect(N))
3995 // fold (mul x, -1) -> 0-x
3996 if (N1IsConst && ConstValue1.isAllOnes())
3997 return DAG.getNode(ISD::SUB, DL, VT,
3998 DAG.getConstant(0, DL, VT), N0);
4000 // fold (mul x, (1 << c)) -> x << c
4001 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
4002 DAG.isKnownToBeAPowerOfTwo(N1) &&
4003 (!VT.isVector() || Level <= AfterLegalizeVectorOps)) {
4004 SDValue LogBase2 = BuildLogBase2(N1, DL);
4005 EVT ShiftVT = getShiftAmountTy(N0.getValueType());
4006 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
4007 return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
4010 // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
4011 if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isNegatedPowerOf2()) {
4012 unsigned Log2Val = (-ConstValue1).logBase2();
4013 // FIXME: If the input is something that is easily negated (e.g. a
4014 // single-use add), we should put the negate there.
4015 return DAG.getNode(ISD::SUB, DL, VT,
4016 DAG.getConstant(0, DL, VT),
4017 DAG.getNode(ISD::SHL, DL, VT, N0,
4018 DAG.getConstant(Log2Val, DL,
4019 getShiftAmountTy(N0.getValueType()))));
4022 // Try to transform:
4023 // (1) multiply-by-(power-of-2 +/- 1) into shift and add/sub.
4024 // mul x, (2^N + 1) --> add (shl x, N), x
4025 // mul x, (2^N - 1) --> sub (shl x, N), x
4026 // Examples: x * 33 --> (x << 5) + x
4027 // x * 15 --> (x << 4) - x
4028 // x * -33 --> -((x << 5) + x)
4029 // x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
4030 // (2) multiply-by-(power-of-2 +/- power-of-2) into shifts and add/sub.
4031 // mul x, (2^N + 2^M) --> (add (shl x, N), (shl x, M))
4032 // mul x, (2^N - 2^M) --> (sub (shl x, N), (shl x, M))
4033 // Examples: x * 0x8800 --> (x << 15) + (x << 11)
4034 // x * 0xf800 --> (x << 16) - (x << 11)
4035 // x * -0x8800 --> -((x << 15) + (x << 11))
4036 // x * -0xf800 --> -((x << 16) - (x << 11)) ; (x << 11) - (x << 16)
4037 if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
4038 // TODO: We could handle more general decomposition of any constant by
4039 // having the target set a limit on number of ops and making a
4040 // callback to determine that sequence (similar to sqrt expansion).
4041 unsigned MathOp = ISD::DELETED_NODE;
// Work on |C|; the sign is re-applied at the end with a final negate.
4042 APInt MulC = ConstValue1.abs();
4043 // The constant `2` should be treated as (2^0 + 1).
4044 unsigned TZeros = MulC == 2 ? 0 : MulC.countTrailingZeros();
4045 MulC.lshrInPlace(TZeros);
4046 if ((MulC - 1).isPowerOf2())
4048 else if ((MulC + 1).isPowerOf2())
4051 if (MathOp != ISD::DELETED_NODE) {
4053 MathOp == ISD::ADD ? (MulC - 1).logBase2() : (MulC + 1).logBase2();
4055 assert(ShAmt < VT.getScalarSizeInBits() &&
4056 "multiply-by-constant generated out of bounds shift");
4058 DAG.getNode(ISD::SHL, DL, VT, N0, DAG.getConstant(ShAmt, DL, VT));
4060 TZeros ? DAG.getNode(MathOp, DL, VT, Shl,
4061 DAG.getNode(ISD::SHL, DL, VT, N0,
4062 DAG.getConstant(TZeros, DL, VT)))
4063 : DAG.getNode(MathOp, DL, VT, Shl, N0);
4064 if (ConstValue1.isNegative())
4065 R = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), R);
4070 // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
4071 if (N0.getOpcode() == ISD::SHL) {
4072 SDValue N01 = N0.getOperand(1);
4073 if (SDValue C3 = DAG.FoldConstantArithmetic(ISD::SHL, DL, VT, {N1, N01}))
4074 return DAG.getNode(ISD::MUL, DL, VT, N0.getOperand(0), C3);
4077 // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
4082 // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
4083 if (N0.getOpcode() == ISD::SHL &&
4084 isConstantOrConstantVector(N0.getOperand(1)) && N0->hasOneUse()) {
4086 } else if (N1.getOpcode() == ISD::SHL &&
4087 isConstantOrConstantVector(N1.getOperand(1)) &&
4093 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, Sh.getOperand(0), Y);
4094 return DAG.getNode(ISD::SHL, DL, VT, Mul, Sh.getOperand(1));
4098 // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
4099 if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
4100 N0.getOpcode() == ISD::ADD &&
4101 DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
4102 isMulAddWithConstProfitable(N, N0, N1))
4105 DAG.getNode(ISD::MUL, SDLoc(N0), VT, N0.getOperand(0), N1),
4106 DAG.getNode(ISD::MUL, SDLoc(N1), VT, N0.getOperand(1), N1));
4108 // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
4109 if (N0.getOpcode() == ISD::VSCALE)
4110 if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
4111 const APInt &C0 = N0.getConstantOperandAPInt(0);
4112 const APInt &C1 = NC1->getAPIntValue();
4113 return DAG.getVScale(DL, VT, C0 * C1);
4116 // Fold (mul step_vector(C0), C1) to (step_vector(C0 * C1)).
4118 if (N0.getOpcode() == ISD::STEP_VECTOR)
4119 if (ISD::isConstantSplatVector(N1.getNode(), MulVal)) {
4120 const APInt &C0 = N0.getConstantOperandAPInt(0);
4121 APInt NewStep = C0 * MulVal;
4122 return DAG.getStepVector(DL, VT, NewStep);
4125 // Fold ((mul x, 0/undef) -> 0,
4126 // (mul x, 1) -> x
4128 // We can replace vectors with '0' and '1' factors with a clearing mask.
4129 if (VT.isFixedLengthVector()) {
4130 unsigned NumElts = VT.getVectorNumElements();
4131 SmallBitVector ClearMask;
4132 ClearMask.reserve(NumElts);
// Per-element predicate: 0/undef factors get cleared (mask bit set);
// other constant factors keep the original lane.
4133 auto IsClearMask = [&ClearMask](ConstantSDNode *V) {
4134 if (!V || V->isZero()) {
4135 ClearMask.push_back(true);
4138 ClearMask.push_back(false);
4141 if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::AND, VT)) &&
4142 ISD::matchUnaryPredicate(N1, IsClearMask, /*AllowUndefs*/ true)) {
4143 assert(N1.getOpcode() == ISD::BUILD_VECTOR && "Unknown constant vector");
4144 EVT LegalSVT = N1.getOperand(0).getValueType();
4145 SDValue Zero = DAG.getConstant(0, DL, LegalSVT);
4146 SDValue AllOnes = DAG.getAllOnesConstant(DL, LegalSVT);
4147 SmallVector<SDValue, 16> Mask(NumElts, AllOnes);
4148 for (unsigned I = 0; I != NumElts; ++I)
4151 return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getBuildVector(VT, DL, Mask));
4156 if (SDValue RMUL = reassociateOps(ISD::MUL, DL, N0, N1, N->getFlags()))
4159 // Simplify the operands using demanded-bits information.
4160 if (SimplifyDemandedBits(SDValue(N, 0)))
4161 return SDValue(N, 0);
4166 /// Return true if divmod libcall is available.
// Only simple scalar integer types (i8..i128) can map to a runtime
// SDIVREM/UDIVREM libcall; vectors and non-simple types return false.
4167 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
4168 const TargetLowering &TLI) {
4170 EVT NodeType = Node->getValueType(0);
4171 if (!NodeType.isSimple())
4173 switch (NodeType.getSimpleVT().SimpleTy) {
4174 default: return false; // No libcall for vector types.
4175 case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
4176 case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
4177 case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
4178 case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
4179 case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
// A null name means the target provides no such libcall.
4182 return TLI.getLibcallName(LC) != nullptr;
4185 /// Issue divrem if both quotient and remainder are needed.
// Given a DIV or REM node, look for a sibling node computing the other half
// of the same division and merge the pair into a single SDIVREM/UDIVREM.
4186 SDValue DAGCombiner::useDivRem(SDNode *Node) {
4187 if (Node->use_empty())
4188 return SDValue(); // This is a dead node, leave it alone.
4190 unsigned Opcode = Node->getOpcode();
4191 bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
4192 unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
4194 // DivMod lib calls can still work on non-legal types if using lib-calls.
4195 EVT VT = Node->getValueType(0);
4196 if (VT.isVector() || !VT.isInteger())
4199 if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
4202 // If DIVREM is going to get expanded into a libcall,
4203 // but there is no libcall available, then don't combine.
4204 if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
4205 !isDivRemLibcallAvailable(Node, isSigned, TLI))
4208 // If div is legal, it's better to do the normal expansion
4209 unsigned OtherOpcode = 0;
4210 if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
4211 OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
4212 if (TLI.isOperationLegalOrCustom(Opcode, VT))
4215 OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
4216 if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
4220 SDValue Op0 = Node->getOperand(0);
4221 SDValue Op1 = Node->getOperand(1);
// Scan the dividend's other users for a matching div/rem over the same
// operands that can share the combined node.
4223 for (SDNode *User : Op0->uses()) {
4224 if (User == Node || User->getOpcode() == ISD::DELETED_NODE ||
4227 // Convert the other matching node(s), too;
4228 // otherwise, the DIVREM may get target-legalized into something
4229 // target-specific that we won't be able to recognize.
4230 unsigned UserOpc = User->getOpcode();
4231 if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
4232 User->getOperand(0) == Op0 &&
4233 User->getOperand(1) == Op1) {
4235 if (UserOpc == OtherOpcode) {
4236 SDVTList VTs = DAG.getVTList(VT, VT);
4237 combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
4238 } else if (UserOpc == DivRemOpc) {
4239 combined = SDValue(User, 0);
4241 assert(UserOpc == Opcode);
// Result 0 of the DIVREM is the quotient, result 1 the remainder.
4245 if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
4246 CombineTo(User, combined);
4247 else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
4248 CombineTo(User, combined.getValue(1));
// Fold div/rem nodes with trivial operands: undef operands, zero dividend,
// one (or boolean) divisor. Shared by the SDIV/UDIV/SREM/UREM visitors.
4254 static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
4255 SDValue N0 = N->getOperand(0);
4256 SDValue N1 = N->getOperand(1);
4257 EVT VT = N->getValueType(0);
4260 unsigned Opc = N->getOpcode();
4261 bool IsDiv = (ISD::SDIV == Opc) || (ISD::UDIV == Opc);
4262 ConstantSDNode *N1C = isConstOrConstSplat(N1);
4264 // X / undef -> undef
4265 // X % undef -> undef
4268 // NOTE: This includes vectors where any divisor element is zero/undef.
4269 if (DAG.isUndef(Opc, {N0, N1}))
4270 return DAG.getUNDEF(VT);
4275 return DAG.getConstant(0, DL, VT);
4279 ConstantSDNode *N0C = isConstOrConstSplat(N0);
4280 if (N0C && N0C->isZero())
4286 return DAG.getConstant(IsDiv ? 1 : 0, DL, VT);
4290 // If this is a boolean op (single-bit element type), we can't have
4291 // division-by-zero or remainder-by-zero, so assume the divisor is 1.
4292 // TODO: Similarly, if we're zero-extending a boolean divisor, then assume
// x / 1 == x and x % 1 == 0.
4294 if ((N1C && N1C->isOne()) || (VT.getScalarType() == MVT::i1))
4295 return IsDiv ? N0 : DAG.getConstant(0, DL, VT);
/// Combine SDIV nodes: constant folding, special divisors (-1, INT_MIN),
/// strength reduction to UDIV when both operands are known non-negative,
/// and pairing with a matching SREM into SDIVREM.
4300 SDValue DAGCombiner::visitSDIV(SDNode *N) {
4301 SDValue N0 = N->getOperand(0);
4302 SDValue N1 = N->getOperand(1);
4303 EVT VT = N->getValueType(0);
4304 EVT CCVT = getSetCCResultType(VT);
4307 // fold (sdiv c1, c2) -> c1/c2
4308 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
4313 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
4316 // fold (sdiv X, -1) -> 0-X
4317 ConstantSDNode *N1C = isConstOrConstSplat(N1);
4318 if (N1C && N1C->isAllOnes())
4319 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
// Only X == MIN_SIGNED divides to 1; every other value divides to 0.
4321 // fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
4322 if (N1C && N1C->getAPIntValue().isMinSignedValue())
4323 return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
4324 DAG.getConstant(1, DL, VT),
4325 DAG.getConstant(0, DL, VT));
4327 if (SDValue V = simplifyDivRem(N, DAG))
4330 if (SDValue NewSel = foldBinOpIntoSelect(N))
4333 // If we know the sign bits of both operands are zero, strength reduce to a
4334 // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
4335 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
4336 return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
4338 if (SDValue V = visitSDIVLike(N0, N1, N)) {
4339 // If the corresponding remainder node exists, update its users with
4340 // (Dividend - (Quotient * Divisor).
4341 if (SDNode *RemNode = DAG.getNodeIfExists(ISD::SREM, N->getVTList(),
4343 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
4344 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
4345 AddToWorklist(Mul.getNode());
4346 AddToWorklist(Sub.getNode());
4347 CombineTo(RemNode, Sub);
4352 // sdiv, srem -> sdivrem
4353 // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
4354 // true. Otherwise, we break the simplification logic in visitREM().
4355 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
4356 if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
4357 if (SDValue DivRem = useDivRem(N))
/// Return true if Divisor is a non-zero, non-opaque constant (scalar or
/// vector of such elements) whose magnitude is a power of two, allowing
/// both +2^k and -2^k.
4363 static bool isDivisorPowerOfTwo(SDValue Divisor) {
4364 // Helper for determining whether a value is a power-2 constant scalar or a
4365 // vector of such elements.
4366 auto IsPowerOfTwo = [](ConstantSDNode *C) {
4367 if (C->isZero() || C->isOpaque())
4369 if (C->getAPIntValue().isPowerOf2())
4371 if (C->getAPIntValue().isNegatedPowerOf2())
4376 return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo);
/// Build an optimized replacement for (sdiv N0, N1) without creating new
/// SDIV nodes: a shift/add sequence for power-of-two divisors (or a
/// target-provided BuildSDIVPow2), else the generic BuildSDIV expansion
/// for other constant divisors.
4379 SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
4381 EVT VT = N->getValueType(0);
4382 EVT CCVT = getSetCCResultType(VT);
4383 unsigned BitWidth = VT.getScalarSizeInBits();
4385 // fold (sdiv X, pow2) -> simple ops after legalize
4386 // FIXME: We check for the exact bit here because the generic lowering gives
4387 // better results in that case. The target-specific lowering should learn how
4388 // to handle exact sdivs efficiently.
4389 if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
4390 // Target-specific implementation of sdiv x, pow2.
4391 if (SDValue Res = BuildSDIVPow2(N))
4394 // Create constants that are functions of the shift amount value.
4395 EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
4396 SDValue Bits = DAG.getConstant(BitWidth, DL, ShiftAmtTy);
// CTTZ of a power-of-two divisor gives log2(|divisor|) == the shift amount.
4397 SDValue C1 = DAG.getNode(ISD::CTTZ, DL, VT, N1);
4398 C1 = DAG.getZExtOrTrunc(C1, DL, ShiftAmtTy);
4399 SDValue Inexact = DAG.getNode(ISD::SUB, DL, ShiftAmtTy, Bits, C1);
4400 if (!isConstantOrConstantVector(Inexact))
4403 // Splat the sign bit into the register
4404 SDValue Sign = DAG.getNode(ISD::SRA, DL, VT, N0,
4405 DAG.getConstant(BitWidth - 1, DL, ShiftAmtTy));
4406 AddToWorklist(Sign.getNode());
// Rounding adjustment so negative dividends round toward zero.
4408 // Add (N0 < 0) ? abs2 - 1 : 0;
4409 SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, Sign, Inexact);
4410 AddToWorklist(Srl.getNode());
4411 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Srl);
4412 AddToWorklist(Add.getNode());
4413 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Add, C1);
4414 AddToWorklist(Sra.getNode());
4416 // Special case: (sdiv X, 1) -> X
4417 // Special Case: (sdiv X, -1) -> 0-X
4418 SDValue One = DAG.getConstant(1, DL, VT);
4419 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
4420 SDValue IsOne = DAG.getSetCC(DL, CCVT, N1, One, ISD::SETEQ);
4421 SDValue IsAllOnes = DAG.getSetCC(DL, CCVT, N1, AllOnes, ISD::SETEQ);
4422 SDValue IsOneOrAllOnes = DAG.getNode(ISD::OR, DL, CCVT, IsOne, IsAllOnes);
4423 Sra = DAG.getSelect(DL, VT, IsOneOrAllOnes, N0, Sra);
4425 // If dividing by a positive value, we're done. Otherwise, the result must
4427 SDValue Zero = DAG.getConstant(0, DL, VT);
4428 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, Zero, Sra);
4430 // FIXME: Use SELECT_CC once we improve SELECT_CC constant-folding.
4431 SDValue IsNeg = DAG.getSetCC(DL, CCVT, N1, Zero, ISD::SETLT);
4432 SDValue Res = DAG.getSelect(DL, VT, IsNeg, Sub, Sra);
4436 // If integer divide is expensive and we satisfy the requirements, emit an
4437 // alternate sequence. Targets may check function attributes for size/speed
4439 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
4440 if (isConstantOrConstantVector(N1) &&
4441 !TLI.isIntDivCheap(N->getValueType(0), Attr))
4442 if (SDValue Op = BuildSDIV(N))
/// Combine UDIV nodes: constant folding, the all-ones divisor special case,
/// and pairing with a matching UREM into UDIVREM.
4448 SDValue DAGCombiner::visitUDIV(SDNode *N) {
4449 SDValue N0 = N->getOperand(0);
4450 SDValue N1 = N->getOperand(1);
4451 EVT VT = N->getValueType(0);
4452 EVT CCVT = getSetCCResultType(VT);
4455 // fold (udiv c1, c2) -> c1/c2
4456 if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
4461 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
// Unsigned division by the maximum value yields 1 only when X equals it.
4464 // fold (udiv X, -1) -> select(X == -1, 1, 0)
4465 ConstantSDNode *N1C = isConstOrConstSplat(N1);
4466 if (N1C && N1C->isAllOnes())
4467 return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
4468 DAG.getConstant(1, DL, VT),
4469 DAG.getConstant(0, DL, VT));
4471 if (SDValue V = simplifyDivRem(N, DAG))
4474 if (SDValue NewSel = foldBinOpIntoSelect(N))
4477 if (SDValue V = visitUDIVLike(N0, N1, N)) {
4478 // If the corresponding remainder node exists, update its users with
4479 // (Dividend - (Quotient * Divisor).
4480 if (SDNode *RemNode = DAG.getNodeIfExists(ISD::UREM, N->getVTList(),
4482 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, V, N1);
4483 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
4484 AddToWorklist(Mul.getNode());
4485 AddToWorklist(Sub.getNode());
4486 CombineTo(RemNode, Sub);
4491 // udiv, urem -> udivrem
4492 // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
4493 // true. Otherwise, we break the simplification logic in visitREM().
4494 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
4495 if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
4496 if (SDValue DivRem = useDivRem(N))
/// Build an optimized replacement for (udiv N0, N1): shift for power-of-two
/// divisors, shift-by-sum for shifted-power-of-two divisors, else the
/// generic BuildUDIV expansion for constant divisors.
4502 SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
4504 EVT VT = N->getValueType(0);
4506 // fold (udiv x, (1 << c)) -> x >>u c
4507 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
4508 DAG.isKnownToBeAPowerOfTwo(N1)) {
4509 SDValue LogBase2 = BuildLogBase2(N1, DL);
4510 AddToWorklist(LogBase2.getNode());
4512 EVT ShiftVT = getShiftAmountTy(N0.getValueType());
4513 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
4514 AddToWorklist(Trunc.getNode());
4515 return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
4518 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
4519 if (N1.getOpcode() == ISD::SHL) {
4520 SDValue N10 = N1.getOperand(0);
4521 if (isConstantOrConstantVector(N10, /*NoOpaques*/ true) &&
4522 DAG.isKnownToBeAPowerOfTwo(N10)) {
4523 SDValue LogBase2 = BuildLogBase2(N10, DL);
4524 AddToWorklist(LogBase2.getNode());
// The shift amount is computed in the type of the SHL's amount operand.
4526 EVT ADDVT = N1.getOperand(1).getValueType();
4527 SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ADDVT);
4528 AddToWorklist(Trunc.getNode());
4529 SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT, N1.getOperand(1), Trunc);
4530 AddToWorklist(Add.getNode());
4531 return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
4535 // fold (udiv x, c) -> alternate
4536 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
4537 if (isConstantOrConstantVector(N1) &&
4538 !TLI.isIntDivCheap(N->getValueType(0), Attr))
4539 if (SDValue Op = BuildUDIV(N))
// Try a target-specific lowering for srem-by-power-of-2, but only when no
// matching SDIV node exists (otherwise visitREM pairs the two instead) and
// the exact flag is absent.
4545 SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) {
4546 if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) &&
4547 !DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) {
4548 // Target-specific implementation of srem x, pow2.
4549 if (SDValue Res = BuildSREMPow2(N))
4555 // handles ISD::SREM and ISD::UREM
// Combines remainder nodes: constant folding, power-of-two masks, the
// X - (X/C)*C rewrite via the div-like helpers, and DIVREM pairing.
4556 SDValue DAGCombiner::visitREM(SDNode *N) {
4557 unsigned Opcode = N->getOpcode();
4558 SDValue N0 = N->getOperand(0);
4559 SDValue N1 = N->getOperand(1);
4560 EVT VT = N->getValueType(0);
4561 EVT CCVT = getSetCCResultType(VT);
4563 bool isSigned = (Opcode == ISD::SREM);
4566 // fold (rem c1, c2) -> c1%c2
4567 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
4570 // fold (urem X, -1) -> select(FX == -1, 0, FX)
4571 // Freeze the numerator to avoid a miscompile with an undefined value.
4572 if (!isSigned && llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false)) {
4573 SDValue F0 = DAG.getFreeze(N0);
4574 SDValue EqualsNeg1 = DAG.getSetCC(DL, CCVT, F0, N1, ISD::SETEQ);
4575 return DAG.getSelect(DL, VT, EqualsNeg1, DAG.getConstant(0, DL, VT), F0);
4578 if (SDValue V = simplifyDivRem(N, DAG))
4581 if (SDValue NewSel = foldBinOpIntoSelect(N))
4585 // If we know the sign bits of both operands are zero, strength reduce to a
4586 // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
4587 if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
4588 return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
4590 if (DAG.isKnownToBeAPowerOfTwo(N1)) {
4591 // fold (urem x, pow2) -> (and x, pow2-1)
4592 SDValue NegOne = DAG.getAllOnesConstant(DL, VT)
4593 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
4594 AddToWorklist(Add.getNode());
4595 return DAG.getNode(ISD::AND, DL, VT, N0, Add);
4597 // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
4598 // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
4599 // TODO: We should sink the following into isKnownToBePowerOfTwo
4600 // using a OrZero parameter analogous to our handling in ValueTracking.
4601 if ((N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) &&
4602 DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
4603 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
4604 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
4605 AddToWorklist(Add.getNode());
4606 return DAG.getNode(ISD::AND, DL, VT, N0, Add);
4610 AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
4612 // If X/C can be simplified by the division-by-constant logic, lower
4613 // X%C to the equivalent of X-X/C*C.
4614 // Reuse the SDIVLike/UDIVLike combines - to avoid mangling nodes, the
4615 // speculative DIV must not cause a DIVREM conversion. We guard against this
4616 // by skipping the simplification if isIntDivCheap(). When div is not cheap,
4617 // combine will not return a DIVREM. Regardless, checking cheapness here
4618 // makes sense since the simplification results in fatter code.
4619 if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
4621 // check if we can build faster implementation for srem
4622 if (SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N))
4623 return OptimizedRem;
4626 SDValue OptimizedDiv =
4627 isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
4628 if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
4629 // If the equivalent Div node also exists, update its users.
4630 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
4631 if (SDNode *DivNode = DAG.getNodeIfExists(DivOpcode, N->getVTList(),
4633 CombineTo(DivNode, OptimizedDiv);
4634 SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
4635 SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
4636 AddToWorklist(OptimizedDiv.getNode());
4637 AddToWorklist(Mul.getNode());
4642 // sdiv, srem -> sdivrem
4643 if (SDValue DivRem = useDivRem(N))
4644 return DivRem.getValue(1);
/// Combine MULHS (signed multiply-high) nodes: constant folding,
/// canonicalization, trivial operands, and widening to a full MUL when a
/// double-width multiply is legal.
4649 SDValue DAGCombiner::visitMULHS(SDNode *N) {
4650 SDValue N0 = N->getOperand(0);
4651 SDValue N1 = N->getOperand(1);
4652 EVT VT = N->getValueType(0);
4655 // fold (mulhs c1, c2)
4656 if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHS, DL, VT, {N0, N1}))
4659 // canonicalize constant to RHS.
4660 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4661 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4662 return DAG.getNode(ISD::MULHS, DL, N->getVTList(), N1, N0);
4664 if (VT.isVector()) {
4665 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
4668 // fold (mulhs x, 0) -> 0
4669 // do not return N1, because undef node may exist.
4670 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
4671 return DAG.getConstant(0, DL, VT);
4674 // fold (mulhs x, 0) -> 0
4675 if (isNullConstant(N1))
// The high half of x*1 is just the sign extension of x.
4678 // fold (mulhs x, 1) -> (sra x, size(x)-1)
4679 if (isOneConstant(N1))
4680 return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
4681 DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
4682 getShiftAmountTy(N0.getValueType())));
4684 // fold (mulhs x, undef) -> 0
4685 if (N0.isUndef() || N1.isUndef())
4686 return DAG.getConstant(0, DL, VT);
4688 // If the type twice as wide is legal, transform the mulhs to a wider multiply
4690 if (!TLI.isOperationLegalOrCustom(ISD::MULHS, VT) && VT.isSimple() &&
4692 MVT Simple = VT.getSimpleVT();
4693 unsigned SimpleSize = Simple.getSizeInBits();
4694 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
4695 if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Sign-extend, multiply in double width, then take the top half.
4696 N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
4697 N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
4698 N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
4699 N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
4700 DAG.getConstant(SimpleSize, DL,
4701 getShiftAmountTy(N1.getValueType())));
4702 return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
/// Combine MULHU (unsigned multiply-high) nodes: constant folding,
/// canonicalization, trivial operands, power-of-two shift rewrite, and
/// widening to a full MUL when a double-width multiply is legal.
4709 SDValue DAGCombiner::visitMULHU(SDNode *N) {
4710 SDValue N0 = N->getOperand(0);
4711 SDValue N1 = N->getOperand(1);
4712 EVT VT = N->getValueType(0);
4715 // fold (mulhu c1, c2)
4716 if (SDValue C = DAG.FoldConstantArithmetic(ISD::MULHU, DL, VT, {N0, N1}))
4719 // canonicalize constant to RHS.
4720 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4721 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4722 return DAG.getNode(ISD::MULHU, DL, N->getVTList(), N1, N0);
4724 if (VT.isVector()) {
4725 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
4728 // fold (mulhu x, 0) -> 0
4729 // do not return N1, because undef node may exist.
4730 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
4731 return DAG.getConstant(0, DL, VT);
4734 // fold (mulhu x, 0) -> 0
4735 if (isNullConstant(N1))
// The unsigned high half of x*1 is always zero.
4738 // fold (mulhu x, 1) -> 0
4739 if (isOneConstant(N1))
4740 return DAG.getConstant(0, DL, N0.getValueType());
4742 // fold (mulhu x, undef) -> 0
4743 if (N0.isUndef() || N1.isUndef())
4744 return DAG.getConstant(0, DL, VT);
// High bits of x << c are the top (bitwidth - c) bits of x.
4746 // fold (mulhu x, (1 << c)) -> x >> (bitwidth - c)
4747 if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
4748 DAG.isKnownToBeAPowerOfTwo(N1) && hasOperation(ISD::SRL, VT)) {
4749 unsigned NumEltBits = VT.getScalarSizeInBits();
4750 SDValue LogBase2 = BuildLogBase2(N1, DL);
4751 SDValue SRLAmt = DAG.getNode(
4752 ISD::SUB, DL, VT, DAG.getConstant(NumEltBits, DL, VT), LogBase2);
4753 EVT ShiftVT = getShiftAmountTy(N0.getValueType());
4754 SDValue Trunc = DAG.getZExtOrTrunc(SRLAmt, DL, ShiftVT);
4755 return DAG.getNode(ISD::SRL, DL, VT, N0, Trunc);
4758 // If the type twice as wide is legal, transform the mulhu to a wider multiply
4760 if (!TLI.isOperationLegalOrCustom(ISD::MULHU, VT) && VT.isSimple() &&
4762 MVT Simple = VT.getSimpleVT();
4763 unsigned SimpleSize = Simple.getSizeInBits();
4764 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
4765 if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Zero-extend, multiply in double width, then take the top half.
4766 N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
4767 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
4768 N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
4769 N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
4770 DAG.getConstant(SimpleSize, DL,
4771 getShiftAmountTy(N1.getValueType())));
4772 return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
4776 // Simplify the operands using demanded-bits information.
4777 // We don't have demanded bits support for MULHU so this just enables constant
4778 // folding based on known bits.
4779 if (SimplifyDemandedBits(SDValue(N, 0)))
4780 return SDValue(N, 0);
// Combine the averaging nodes (AVGFLOORS/AVGFLOORU/AVGCEILS/AVGCEILU —
// the exact opcode is read from the node): constant-fold, canonicalize
// constants to the RHS, and fold a zero splat RHS of the floor variants
// into a single arithmetic/logical shift right by one.
// NOTE(review): original line numbers jump (e.g. 4810->4814) — the undef
// fold's body and the function epilogue are elided in this excerpt.
4785 SDValue DAGCombiner::visitAVG(SDNode *N) {
4786 unsigned Opcode = N->getOpcode();
4787 SDValue N0 = N->getOperand(0);
4788 SDValue N1 = N->getOperand(1);
4789 EVT VT = N->getValueType(0);
4792 // fold (avg c1, c2)
4793 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
4796 // canonicalize constant to RHS.
4797 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4798 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4799 return DAG.getNode(Opcode, DL, N->getVTList(), N1, N0);
4801 if (VT.isVector()) {
4802 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
4805 // fold (avgfloor x, 0) -> x >> 1
4806 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
// avgfloor(x, 0) == (x + 0) >> 1; signed uses SRA, unsigned uses SRL.
4807 if (Opcode == ISD::AVGFLOORS)
4808 return DAG.getNode(ISD::SRA, DL, VT, N0, DAG.getConstant(1, DL, VT));
4809 if (Opcode == ISD::AVGFLOORU)
4810 return DAG.getNode(ISD::SRL, DL, VT, N0, DAG.getConstant(1, DL, VT));
4814 // fold (avg x, undef) -> x
4820 // TODO If we use avg for scalars anywhere, we can add (avgfl x, 0) -> x >> 1
4825 /// Perform optimizations common to nodes that compute two values. LoOp and HiOp
4826 /// give the opcodes for the two computations that are being performed. Return
4827 /// true if a simplification was made.
// Shared helper for two-result nodes (e.g. SMUL_LOHI/UMUL_LOHI): if only one
// of the two results is used, replace the node with the single-result opcode
// (LoOp or HiOp); otherwise try to simplify each half independently via the
// combiner and keep whichever simplification succeeds.
// NOTE(review): the parameter list is cut mid-signature in this excerpt (the
// HiOp parameter line is elided), and several returns/braces are missing
// where the embedded line numbers jump; code kept byte-identical.
4828 SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
4830 // If the high half is not needed, just compute the low half.
4831 bool HiExists = N->hasAnyUseOfValue(1);
4832 if (!HiExists && (!LegalOperations ||
4833 TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
// CombineTo replaces both results with Res so the dead high half goes away.
4834 SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
4835 return CombineTo(N, Res, Res);
4838 // If the low half is not needed, just compute the high half.
4839 bool LoExists = N->hasAnyUseOfValue(0);
4840 if (!LoExists && (!LegalOperations ||
4841 TLI.isOperationLegalOrCustom(HiOp, N->getValueType(1)))) {
4842 SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
4843 return CombineTo(N, Res, Res);
4846 // If both halves are used, return as it is.
4847 if (LoExists && HiExists)
4850 // If the two computed results can be simplified separately, separate them.
4852 SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
4853 AddToWorklist(Lo.getNode());
4854 SDValue LoOpt = combine(Lo.getNode());
// Only accept the combined form if it is a genuinely different node and is
// legal (or we are pre-legalization).
4855 if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
4856 (!LegalOperations ||
4857 TLI.isOperationLegalOrCustom(LoOpt.getOpcode(), LoOpt.getValueType())))
4858 return CombineTo(N, LoOpt, LoOpt);
4862 SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
4863 AddToWorklist(Hi.getNode());
4864 SDValue HiOpt = combine(Hi.getNode());
4865 if (HiOpt.getNode() && HiOpt != Hi &&
4866 (!LegalOperations ||
4867 TLI.isOperationLegalOrCustom(HiOpt.getOpcode(), HiOpt.getValueType())))
4868 return CombineTo(N, HiOpt, HiOpt);
// Combine ISD::SMUL_LOHI (signed multiply producing low+high halves):
// first defer to SimplifyNodeWithTwoResults(MUL, MULHS), then canonicalize
// constants to the RHS, and finally expand through a double-width signed
// multiply when the 2x-wide MUL is legal.
// NOTE(review): line numbers jump (4875->4878, 4905->4912) — the early
// return and epilogue are elided in this excerpt; code kept byte-identical.
4874 SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
4875 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
4878 SDValue N0 = N->getOperand(0);
4879 SDValue N1 = N->getOperand(1);
4880 EVT VT = N->getValueType(0);
4883 // canonicalize constant to RHS (vector doesn't have to splat)
4884 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4885 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4886 return DAG.getNode(ISD::SMUL_LOHI, DL, N->getVTList(), N1, N0);
4888 // If the type is twice as wide is legal, transform the mulhu to a wider
4889 // multiply plus a shift.
4890 if (VT.isSimple() && !VT.isVector()) {
4891 MVT Simple = VT.getSimpleVT();
4892 unsigned SimpleSize = Simple.getSizeInBits();
4893 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
4894 if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Sign-extend both operands, multiply in the wide type; the high result
// is the wide product shifted down by the original width, the low result
// is a plain truncate of the wide product.
4895 SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
4896 SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
4897 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
4898 // Compute the high part as N1.
4899 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
4900 DAG.getConstant(SimpleSize, DL,
4901 getShiftAmountTy(Lo.getValueType())));
4902 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
4903 // Compute the low part as N0.
4904 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
4905 return CombineTo(N, Lo, Hi);
// Combine ISD::UMUL_LOHI (unsigned multiply producing low+high halves):
// defer to SimplifyNodeWithTwoResults(MUL, MULHU), canonicalize constants
// to the RHS, fold multiplication by 0 and 1, and expand through a
// double-width unsigned multiply when the 2x-wide MUL is legal.
// NOTE(review): line numbers jump (4913->4916, 4955->4962) — early return
// and epilogue are elided in this excerpt; code kept byte-identical.
4912 SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
4913 if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
4916 SDValue N0 = N->getOperand(0);
4917 SDValue N1 = N->getOperand(1);
4918 EVT VT = N->getValueType(0);
4921 // canonicalize constant to RHS (vector doesn't have to splat)
4922 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4923 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4924 return DAG.getNode(ISD::UMUL_LOHI, DL, N->getVTList(), N1, N0);
4926 // (umul_lohi N0, 0) -> (0, 0)
4927 if (isNullConstant(N1)) {
4928 SDValue Zero = DAG.getConstant(0, DL, VT);
4929 return CombineTo(N, Zero, Zero);
4932 // (umul_lohi N0, 1) -> (N0, 0)
4933 if (isOneConstant(N1)) {
4934 SDValue Zero = DAG.getConstant(0, DL, VT);
4935 return CombineTo(N, N0, Zero);
4938 // If the type is twice as wide is legal, transform the mulhu to a wider
4939 // multiply plus a shift.
4940 if (VT.isSimple() && !VT.isVector()) {
4941 MVT Simple = VT.getSimpleVT();
4942 unsigned SimpleSize = Simple.getSizeInBits();
4943 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
4944 if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
// Same expansion as SMUL_LOHI but with zero-extension for the unsigned
// semantics: wide product >> width is the high half, truncate is the low.
4945 SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
4946 SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
4947 Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
4948 // Compute the high part as N1.
4949 Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
4950 DAG.getConstant(SimpleSize, DL,
4951 getShiftAmountTy(Lo.getValueType())));
4952 Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
4953 // Compute the low part as N0.
4954 Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
4955 return CombineTo(N, Lo, Hi);
// Combine ISD::SMULO / ISD::UMULO (multiply with overflow flag): constant
// fold both results, canonicalize constants to the RHS, fold x*0, turn x*2
// into an overflow-checked add, special-case 1-bit SMULO as AND, and use
// sign-bit / known-bits analysis to prove the overflow flag is always 0.
// NOTE(review): line numbers jump in several places (e.g. 4976->4980,
// 5015->5017, 5023->5025) — guard conditions and declarations are elided
// in this excerpt; code kept byte-identical.
4962 SDValue DAGCombiner::visitMULO(SDNode *N) {
4963 SDValue N0 = N->getOperand(0);
4964 SDValue N1 = N->getOperand(1);
4965 EVT VT = N0.getValueType();
4966 bool IsSigned = (ISD::SMULO == N->getOpcode());
4968 EVT CarryVT = N->getValueType(1);
4971 ConstantSDNode *N0C = isConstOrConstSplat(N0);
4972 ConstantSDNode *N1C = isConstOrConstSplat(N1);
4974 // fold operation with constant operands.
4975 // TODO: Move this to FoldConstantArithmetic when it supports nodes with
4976 // multiple results.
// smul_ov/umul_ov compute the product and set Overflow as a side effect.
4980 IsSigned ? N0C->getAPIntValue().smul_ov(N1C->getAPIntValue(), Overflow)
4981 : N0C->getAPIntValue().umul_ov(N1C->getAPIntValue(), Overflow);
4982 return CombineTo(N, DAG.getConstant(Result, DL, VT),
4983 DAG.getBoolConstant(Overflow, DL, CarryVT, CarryVT));
4986 // canonicalize constant to RHS.
4987 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
4988 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
4989 return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
4991 // fold (mulo x, 0) -> 0 + no carry out
4992 if (isNullOrNullSplat(N1))
4993 return CombineTo(N, DAG.getConstant(0, DL, VT),
4994 DAG.getConstant(0, DL, CarryVT));
4996 // (mulo x, 2) -> (addo x, x)
4997 // FIXME: This needs a freeze.
// The >2-bit restriction avoids the signed 2-bit case where x*2 and x+x
// disagree on overflow semantics.
4998 if (N1C && N1C->getAPIntValue() == 2 &&
4999 (!IsSigned || VT.getScalarSizeInBits() > 2))
5000 return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
5001 N->getVTList(), N0, N0);
5004 // A 1 bit SMULO overflows if both inputs are 1.
5005 if (VT.getScalarSizeInBits() == 1) {
5006 SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, N1);
5007 return CombineTo(N, And,
5008 DAG.getSetCC(DL, CarryVT, And,
5009 DAG.getConstant(0, DL, VT), ISD::SETNE));
5012 // Multiplying n * m significant bits yields a result of n + m significant
5013 // bits. If the total number of significant bits does not exceed the
5014 // result bit width (minus 1), there is no overflow.
5015 unsigned SignBits = DAG.ComputeNumSignBits(N0);
5017 SignBits += DAG.ComputeNumSignBits(N1);
5018 if (SignBits > VT.getScalarSizeInBits() + 1)
5019 return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
5020 DAG.getConstant(0, DL, CarryVT));
5022 KnownBits N1Known = DAG.computeKnownBits(N1);
5023 KnownBits N0Known = DAG.computeKnownBits(N0);
// Unsigned case: if the max possible operand values cannot overflow,
// replace with a plain MUL and a constant-0 carry.
5025 (void)N0Known.getMaxValue().umul_ov(N1Known.getMaxValue(), Overflow);
5027 return CombineTo(N, DAG.getNode(ISD::MUL, DL, VT, N0, N1),
5028 DAG.getConstant(0, DL, CarryVT));
5034 // Function to calculate whether the Min/Max pair of SDNodes (potentially
5035 // swapped around) make a signed saturate pattern, clamping to between a signed
5036 // saturate of -2^(BW-1) and 2^(BW-1)-1, or an unsigned saturate of 0 and 2^BW.
5037 // Returns the node being clamped and the bitwidth of the clamp in BW. Should
5038 // work with both SMIN/SMAX nodes and setcc/select combo. The operands are the
5039 // same as SimplifySelectCC. N0<N1 ? N2 : N3.
// Match a clamp made of two nested signed min/max operations (as SMIN/SMAX
// nodes, select_cc, or select-of-setcc). On success, sets BW to the clamp's
// effective bit width and returns the value being clamped; the operand
// convention follows SimplifySelectCC: N0<N1 ? N2 : N3.
// NOTE(review): line numbers jump throughout (5041->5043, 5058->5061,
// 5109->5114, ...) — early-return guards and the Unsigned flag handling
// are elided in this excerpt; code kept byte-identical.
5040 static SDValue isSaturatingMinMax(SDValue N0, SDValue N1, SDValue N2,
5041 SDValue N3, ISD::CondCode CC, unsigned &BW,
// Recognizes one select/compare layer as an SMIN/SMAX-equivalent; returns
// the min/max opcode or 0 when the pattern does not match.
5043 auto isSignedMinMax = [&](SDValue N0, SDValue N1, SDValue N2, SDValue N3,
5045 // The compare and select operand should be the same or the select operands
5046 // should be truncated versions of the comparison.
5047 if (N0 != N2 && (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0)))
5049 // The constants need to be the same or a truncated version of each other.
5050 ConstantSDNode *N1C = isConstOrConstSplat(N1);
5051 ConstantSDNode *N3C = isConstOrConstSplat(N3);
5054 const APInt &C1 = N1C->getAPIntValue();
5055 const APInt &C2 = N3C->getAPIntValue();
5056 if (C1.getBitWidth() < C2.getBitWidth() || C1 != C2.sext(C1.getBitWidth()))
5058 return CC == ISD::SETLT ? ISD::SMIN : (CC == ISD::SETGT ? ISD::SMAX : 0);
5061 // Check the initial value is a SMIN/SMAX equivalent.
5062 unsigned Opcode0 = isSignedMinMax(N0, N1, N2, N3, CC);
5066 SDValue N00, N01, N02, N03;
// Decompose the inner node (SMIN/SMAX, SELECT_CC, or SELECT of SETCC) into
// comparison operands + select operands + condition code.
5068 switch (N0.getOpcode()) {
5071 N00 = N02 = N0.getOperand(0);
5072 N01 = N03 = N0.getOperand(1);
5073 N0CC = N0.getOpcode() == ISD::SMIN ? ISD::SETLT : ISD::SETGT;
5075 case ISD::SELECT_CC:
5076 N00 = N0.getOperand(0);
5077 N01 = N0.getOperand(1);
5078 N02 = N0.getOperand(2);
5079 N03 = N0.getOperand(3);
5080 N0CC = cast<CondCodeSDNode>(N0.getOperand(4))->get();
5084 if (N0.getOperand(0).getOpcode() != ISD::SETCC)
5086 N00 = N0.getOperand(0).getOperand(0);
5087 N01 = N0.getOperand(0).getOperand(1);
5088 N02 = N0.getOperand(1);
5089 N03 = N0.getOperand(2);
5090 N0CC = cast<CondCodeSDNode>(N0.getOperand(0).getOperand(2))->get();
// The inner layer must also be a min/max, and of the opposite kind
// (one SMIN + one SMAX make a clamp).
5096 unsigned Opcode1 = isSignedMinMax(N00, N01, N02, N03, N0CC);
5097 if (!Opcode1 || Opcode0 == Opcode1)
5100 ConstantSDNode *MinCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N1 : N01);
5101 ConstantSDNode *MaxCOp = isConstOrConstSplat(Opcode0 == ISD::SMIN ? N01 : N1);
5102 if (!MinCOp || !MaxCOp || MinCOp->getValueType(0) != MaxCOp->getValueType(0))
5105 const APInt &MinC = MinCOp->getAPIntValue();
5106 const APInt &MaxC = MaxCOp->getAPIntValue();
5107 APInt MinCPlus1 = MinC + 1;
// Signed saturate: clamp to [-2^(BW-1), 2^(BW-1)-1].
5108 if (-MaxC == MinCPlus1 && MinCPlus1.isPowerOf2()) {
5109 BW = MinCPlus1.exactLogBase2() + 1;
// Unsigned-style saturate: clamp to [-(2^BW), 0] per the header comment —
// TODO confirm against the elided lines (the Unsigned out-flag is not
// visible in this excerpt).
5114 if (MaxC == 0 && MinCPlus1.isPowerOf2()) {
5115 BW = MinCPlus1.exactLogBase2();
// If a min/max clamp (per isSaturatingMinMax) wraps an FP_TO_SINT, rewrite
// the whole pattern as a single FP_TO_SINT_SAT / FP_TO_UINT_SAT node when
// the target opts in via shouldConvertFpToSat, then extend/truncate the
// saturated result back to the select's type.
// NOTE(review): declarations of BW/Unsigned and some guards are elided
// where the embedded line numbers jump; code kept byte-identical.
5123 static SDValue PerformMinMaxFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
5124 SDValue N3, ISD::CondCode CC,
5125 SelectionDAG &DAG) {
5128 SDValue Fp = isSaturatingMinMax(N0, N1, N2, N3, CC, BW, Unsigned);
5129 if (!Fp || Fp.getOpcode() != ISD::FP_TO_SINT)
5131 EVT FPVT = Fp.getOperand(0).getValueType();
5132 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
5133 if (FPVT.isVector())
5134 NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
5135 FPVT.getVectorElementCount());
5136 unsigned NewOpc = Unsigned ? ISD::FP_TO_UINT_SAT : ISD::FP_TO_SINT_SAT;
5137 if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(NewOpc, FPVT, NewVT))
// The FP_TO_*_SAT node carries the saturation width as a ValueType operand.
5140 SDValue Sat = DAG.getNode(NewOpc, DL, NewVT, Fp.getOperand(0),
5141 DAG.getValueType(NewVT.getScalarType()));
5142 return Unsigned ? DAG.getZExtOrTrunc(Sat, DL, N2->getValueType(0))
5143 : DAG.getSExtOrTrunc(Sat, DL, N2->getValueType(0));
// Match UMIN(FPTOUI(X), 2^n - 1) — possibly expressed as a select/vselect/
// select_cc with SETULT — and rewrite it as a single FP_TO_UINT_SAT node of
// width n when the target opts in via shouldConvertFpToSat.
// NOTE(review): the first guard's opening condition and a couple of early
// returns are elided where the line numbers jump; code kept byte-identical.
5146 static SDValue PerformUMinFpToSatCombine(SDValue N0, SDValue N1, SDValue N2,
5147 SDValue N3, ISD::CondCode CC,
5148 SelectionDAG &DAG) {
5149 // We are looking for UMIN(FPTOUI(X), (2^n)-1), which may have come via a
5150 // select/vselect/select_cc. The two operands pairs for the select (N2/N3) may
5151 // be truncated versions of the the setcc (N0/N1).
5153 (N2.getOpcode() != ISD::TRUNCATE || N0 != N2.getOperand(0))) ||
5154 N0.getOpcode() != ISD::FP_TO_UINT || CC != ISD::SETULT)
5156 ConstantSDNode *N1C = isConstOrConstSplat(N1);
5157 ConstantSDNode *N3C = isConstOrConstSplat(N3);
5160 const APInt &C1 = N1C->getAPIntValue();
5161 const APInt &C3 = N3C->getAPIntValue();
// Both constants must be the same 2^n - 1 value (modulo zero-extension).
5162 if (!(C1 + 1).isPowerOf2() || C1.getBitWidth() < C3.getBitWidth() ||
5163 C1 != C3.zext(C1.getBitWidth()))
5166 unsigned BW = (C1 + 1).exactLogBase2();
5167 EVT FPVT = N0.getOperand(0).getValueType();
5168 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), BW);
5169 if (FPVT.isVector())
5170 NewVT = EVT::getVectorVT(*DAG.getContext(), NewVT,
5171 FPVT.getVectorElementCount());
5172 if (!DAG.getTargetLoweringInfo().shouldConvertFpToSat(ISD::FP_TO_UINT_SAT,
5177 DAG.getNode(ISD::FP_TO_UINT_SAT, SDLoc(N0), NewVT, N0.getOperand(0),
5178 DAG.getValueType(NewVT.getScalarType()));
5179 return DAG.getZExtOrTrunc(Sat, SDLoc(N0), N3.getValueType());
// Combine the integer min/max nodes (SMIN/SMAX/UMIN/UMAX): constant-fold,
// canonicalize constants to the RHS, flip signed<->unsigned when the sign
// bits are known zero and only the flipped opcode is legal, try the
// fp-to-sat clamp rewrites, and finish with demanded-bits simplification.
// NOTE(review): several returns/braces are elided where the embedded line
// numbers jump (e.g. 5190->5193, 5200->5204); code kept byte-identical.
5182 SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
5183 SDValue N0 = N->getOperand(0);
5184 SDValue N1 = N->getOperand(1);
5185 EVT VT = N0.getValueType();
5186 unsigned Opcode = N->getOpcode();
5189 // fold operation with constant operands.
5190 if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
5193 // If the operands are the same, this is a no-op.
5197 // canonicalize constant to RHS
5198 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
5199 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
5200 return DAG.getNode(Opcode, DL, VT, N1, N0);
5204 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
5207 // Is sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
5208 // Only do this if the current op isn't legal and the flipped is.
5209 if (!TLI.isOperationLegal(Opcode, VT) &&
5210 (N0.isUndef() || DAG.SignBitIsZero(N0)) &&
5211 (N1.isUndef() || DAG.SignBitIsZero(N1))) {
// With sign bits known clear, signed and unsigned comparisons agree, so
// the signed/unsigned variants are interchangeable.
5214 case ISD::SMIN: AltOpcode = ISD::UMIN; break;
5215 case ISD::SMAX: AltOpcode = ISD::UMAX; break;
5216 case ISD::UMIN: AltOpcode = ISD::SMIN; break;
5217 case ISD::UMAX: AltOpcode = ISD::SMAX; break;
5218 default: llvm_unreachable("Unknown MINMAX opcode");
5220 if (TLI.isOperationLegal(AltOpcode, VT))
5221 return DAG.getNode(AltOpcode, DL, VT, N0, N1);
5224 if (Opcode == ISD::SMIN || Opcode == ISD::SMAX)
5225 if (SDValue S = PerformMinMaxFpToSatCombine(
5226 N0, N1, N0, N1, Opcode == ISD::SMIN ? ISD::SETLT : ISD::SETGT, DAG))
5228 if (Opcode == ISD::UMIN)
5229 if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N0, N1, ISD::SETULT, DAG))
5232 // Simplify the operands using demanded-bits information.
5233 if (SimplifyDemandedBits(SDValue(N, 0)))
5234 return SDValue(N, 0);
5239 /// If this is a bitwise logic instruction and both operands have the same
5240 /// opcode, try to sink the other opcode after the logic instruction.
// For a bitwise logic op (AND/OR/XOR) whose two operands share the same
// opcode ("hand op"), sink that shared opcode below the logic op:
// logic(hand(x), hand(y)) -> hand(logic(x, y)). Handles extends, truncate,
// shifts/and with a common second operand, bswap, bitcast /
// scalar_to_vector, and same-mask vector shuffles.
// NOTE(review): some returns/guards are elided where the embedded line
// numbers jump (e.g. 5251->5254, 5396->5403); code kept byte-identical.
5241 SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
5242 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
5243 EVT VT = N0.getValueType();
5244 unsigned LogicOpcode = N->getOpcode();
5245 unsigned HandOpcode = N0.getOpcode();
5246 assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
5247 LogicOpcode == ISD::XOR) && "Expected logic opcode");
5248 assert(HandOpcode == N1.getOpcode() && "Bad input!");
5250 // Bail early if none of these transforms apply.
5251 if (N0.getNumOperands() == 0)
5254 // FIXME: We should check number of uses of the operands to not increase
5255 // the instruction count for all transforms.
5257 // Handle size-changing casts.
5258 SDValue X = N0.getOperand(0);
5259 SDValue Y = N1.getOperand(0);
5260 EVT XVT = X.getValueType();
5262 if (HandOpcode == ISD::ANY_EXTEND || HandOpcode == ISD::ZERO_EXTEND ||
5263 HandOpcode == ISD::SIGN_EXTEND) {
5264 // If both operands have other uses, this transform would create extra
5265 // instructions without eliminating anything.
5266 if (!N0.hasOneUse() && !N1.hasOneUse())
5268 // We need matching integer source types.
5269 if (XVT != Y.getValueType())
5271 // Don't create an illegal op during or after legalization. Don't ever
5272 // create an unsupported vector op.
5273 if ((VT.isVector() || LegalOperations) &&
5274 !TLI.isOperationLegalOrCustom(LogicOpcode, XVT))
5276 // Avoid infinite looping with PromoteIntBinOp.
5277 // TODO: Should we apply desirable/legal constraints to all opcodes?
5278 if (HandOpcode == ISD::ANY_EXTEND && LegalTypes &&
5279 !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
5281 // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
5282 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5283 return DAG.getNode(HandOpcode, DL, VT, Logic);
5286 // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
5287 if (HandOpcode == ISD::TRUNCATE) {
5288 // If both operands have other uses, this transform would create extra
5289 // instructions without eliminating anything.
5290 if (!N0.hasOneUse() && !N1.hasOneUse())
5292 // We need matching source types.
5293 if (XVT != Y.getValueType())
5295 // Don't create an illegal op during or after legalization.
5296 if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
5298 // Be extra careful sinking truncate. If it's free, there's no benefit in
5299 // widening a binop. Also, don't create a logic op on an illegal type.
5300 if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
5302 if (!TLI.isTypeLegal(XVT))
5304 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5305 return DAG.getNode(HandOpcode, DL, VT, Logic);
5308 // For binops SHL/SRL/SRA/AND:
5309 // logic_op (OP x, z), (OP y, z) --> OP (logic_op x, y), z
5310 if ((HandOpcode == ISD::SHL || HandOpcode == ISD::SRL ||
5311 HandOpcode == ISD::SRA || HandOpcode == ISD::AND) &&
5312 N0.getOperand(1) == N1.getOperand(1)) {
5313 // If either operand has other uses, this transform is not an improvement.
5314 if (!N0.hasOneUse() || !N1.hasOneUse())
5316 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5317 return DAG.getNode(HandOpcode, DL, VT, Logic, N0.getOperand(1));
5320 // Unary ops: logic_op (bswap x), (bswap y) --> bswap (logic_op x, y)
5321 if (HandOpcode == ISD::BSWAP) {
5322 // If either operand has other uses, this transform is not an improvement.
5323 if (!N0.hasOneUse() || !N1.hasOneUse())
5325 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5326 return DAG.getNode(HandOpcode, DL, VT, Logic);
5329 // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
5330 // Only perform this optimization up until type legalization, before
5331 // LegalizeVectorOprs. LegalizeVectorOprs promotes vector operations by
5332 // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
5333 // we don't want to undo this promotion.
5334 // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
5336 if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
5337 Level <= AfterLegalizeTypes) {
5338 // Input types must be integer and the same.
5339 if (XVT.isInteger() && XVT == Y.getValueType() &&
5340 !(VT.isVector() && TLI.isTypeLegal(VT) &&
5341 !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
5342 SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
5343 return DAG.getNode(HandOpcode, DL, VT, Logic);
5347 // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
5348 // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
5349 // If both shuffles use the same mask, and both shuffle within a single
5350 // vector, then it is worthwhile to move the swizzle after the operation.
5351 // The type-legalizer generates this pattern when loading illegal
5352 // vector types from memory. In many cases this allows additional shuffle
5354 // There are other cases where moving the shuffle after the xor/and/or
5355 // is profitable even if shuffles don't perform a swizzle.
5356 // If both shuffles use the same mask, and both shuffles have the same first
5357 // or second operand, then it might still be profitable to move the shuffle
5358 // after the xor/and/or operation.
5359 if (HandOpcode == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
5360 auto *SVN0 = cast<ShuffleVectorSDNode>(N0);
5361 auto *SVN1 = cast<ShuffleVectorSDNode>(N1);
5362 assert(X.getValueType() == Y.getValueType() &&
5363 "Inputs to shuffles are not the same type");
5365 // Check that both shuffles use the same mask. The masks are known to be of
5366 // the same length because the result vector type is the same.
5367 // Check also that shuffles have only one use to avoid introducing extra
5369 if (!SVN0->hasOneUse() || !SVN1->hasOneUse() ||
5370 !SVN0->getMask().equals(SVN1->getMask()))
5373 // Don't try to fold this node if it requires introducing a
5374 // build vector of all zeros that might be illegal at this stage.
5375 SDValue ShOp = N0.getOperand(1);
5376 if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
5377 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
5379 // (logic_op (shuf (A, C), shuf (B, C))) --> shuf (logic_op (A, B), C)
5380 if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
5381 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT,
5382 N0.getOperand(0), N1.getOperand(0));
5383 return DAG.getVectorShuffle(VT, DL, Logic, ShOp, SVN0->getMask());
5386 // Don't try to fold this node if it requires introducing a
5387 // build vector of all zeros that might be illegal at this stage.
5388 ShOp = N0.getOperand(0);
5389 if (LogicOpcode == ISD::XOR && !ShOp.isUndef())
5390 ShOp = tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
5392 // (logic_op (shuf (C, A), shuf (C, B))) --> shuf (C, logic_op (A, B))
5393 if (N0.getOperand(0) == N1.getOperand(0) && ShOp.getNode()) {
5394 SDValue Logic = DAG.getNode(LogicOpcode, DL, VT, N0.getOperand(1),
5396 return DAG.getVectorShuffle(VT, DL, ShOp, Logic, SVN0->getMask());
5403 /// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
// Fold and/or of two setcc-equivalent nodes into a single comparison:
// merge zero/-1 sign-bit tests via OR/AND of the operands, turn a pair of
// not-equal tests against 0 and -1 into an unsigned range check, convert
// matching-predicate compares to xor/or bitwise logic when the target
// allows, fold compares against constants differing by one bit, and merge
// two compares of the same operands via setcc condition-code algebra.
// NOTE(review): several early returns/guards are elided where the embedded
// line numbers jump (e.g. 5408->5411, 5529->5533); code kept byte-identical.
5404 SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
5406 SDValue LL, LR, RL, RR, N0CC, N1CC;
5407 if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
5408 !isSetCCEquivalent(N1, RL, RR, N1CC))
5411 assert(N0.getValueType() == N1.getValueType() &&
5412 "Unexpected operand types for bitwise logic op");
5413 assert(LL.getValueType() == LR.getValueType() &&
5414 RL.getValueType() == RR.getValueType() &&
5415 "Unexpected operand types for setcc");
5417 // If we're here post-legalization or the logic op type is not i1, the logic
5418 // op type must match a setcc result type. Also, all folds require new
5419 // operations on the left and right operands, so those types must match.
5420 EVT VT = N0.getValueType();
5421 EVT OpVT = LL.getValueType();
5422 if (LegalOperations || VT.getScalarType() != MVT::i1)
5423 if (VT != getSetCCResultType(OpVT))
5425 if (OpVT != RL.getValueType())
5428 ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
5429 ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
5430 bool IsInteger = OpVT.isInteger();
5431 if (LR == RR && CC0 == CC1 && IsInteger) {
5432 bool IsZero = isNullOrNullSplat(LR);
5433 bool IsNeg1 = isAllOnesOrAllOnesSplat(LR);
5436 bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
5437 // All sign bits clear?
5438 bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
5440 bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
5441 // Any sign bits set?
5442 bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
5444 // (and (seteq X, 0), (seteq Y, 0)) --> (seteq (or X, Y), 0)
5445 // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
5446 // (or (setne X, 0), (setne Y, 0)) --> (setne (or X, Y), 0)
5447 // (or (setlt X, 0), (setlt Y, 0)) --> (setlt (or X, Y), 0)
5448 if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
5449 SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
5450 AddToWorklist(Or.getNode());
5451 return DAG.getSetCC(DL, VT, Or, LR, CC1);
5455 bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
5456 // All sign bits set?
5457 bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
5459 bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
5460 // Any sign bits clear?
5461 bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
5463 // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
5464 // (and (setlt X, 0), (setlt Y, 0)) --> (setlt (and X, Y), 0)
5465 // (or (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
5466 // (or (setgt X, -1), (setgt Y -1)) --> (setgt (and X, Y), -1)
5467 if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
5468 SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
5469 AddToWorklist(And.getNode());
5470 return DAG.getSetCC(DL, VT, And, LR, CC1);
5474 // TODO: What is the 'or' equivalent of this fold?
5475 // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
// X not in {0, -1} iff X+1 not in {1, 0} iff (unsigned) X+1 >= 2.
// Needs >1-bit types so that 0 and -1 are distinct values.
5476 if (IsAnd && LL == RL && CC0 == CC1 && OpVT.getScalarSizeInBits() > 1 &&
5477 IsInteger && CC0 == ISD::SETNE &&
5478 ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
5479 (isAllOnesConstant(LR) && isNullConstant(RR)))) {
5480 SDValue One = DAG.getConstant(1, DL, OpVT);
5481 SDValue Two = DAG.getConstant(2, DL, OpVT);
5482 SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
5483 AddToWorklist(Add.getNode());
5484 return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
5487 // Try more general transforms if the predicates match and the only user of
5488 // the compares is the 'and' or 'or'.
5489 if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
5490 N0.hasOneUse() && N1.hasOneUse()) {
5491 // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
5492 // or (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
5493 if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
5494 SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
5495 SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
5496 SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
5497 SDValue Zero = DAG.getConstant(0, DL, OpVT);
5498 return DAG.getSetCC(DL, VT, Or, Zero, CC1);
5501 // Turn compare of constants whose difference is 1 bit into add+and+setcc.
5502 if ((IsAnd && CC1 == ISD::SETNE) || (!IsAnd && CC1 == ISD::SETEQ)) {
5503 // Match a shared variable operand and 2 non-opaque constant operands.
5504 auto MatchDiffPow2 = [&](ConstantSDNode *C0, ConstantSDNode *C1) {
5505 // The difference of the constants must be a single bit.
5507 APIntOps::umax(C0->getAPIntValue(), C1->getAPIntValue());
5509 APIntOps::umin(C0->getAPIntValue(), C1->getAPIntValue());
5510 return !C0->isOpaque() && !C1->isOpaque() && (CMax - CMin).isPowerOf2();
5512 if (LL == RL && ISD::matchBinaryPredicate(LR, RR, MatchDiffPow2)) {
5513 // and/or (setcc X, CMax, ne), (setcc X, CMin, ne/eq) -->
5514 // setcc ((sub X, CMin), ~(CMax - CMin)), 0, ne/eq
5515 SDValue Max = DAG.getNode(ISD::UMAX, DL, OpVT, LR, RR);
5516 SDValue Min = DAG.getNode(ISD::UMIN, DL, OpVT, LR, RR);
5517 SDValue Offset = DAG.getNode(ISD::SUB, DL, OpVT, LL, Min);
5518 SDValue Diff = DAG.getNode(ISD::SUB, DL, OpVT, Max, Min);
5519 SDValue Mask = DAG.getNOT(DL, Diff, OpVT);
5520 SDValue And = DAG.getNode(ISD::AND, DL, OpVT, Offset, Mask);
5521 SDValue Zero = DAG.getConstant(0, DL, OpVT);
5522 return DAG.getSetCC(DL, VT, And, Zero, CC0);
5527 // Canonicalize equivalent operands to LL == RL.
5528 if (LL == RR && LR == RL) {
5529 CC1 = ISD::getSetCCSwappedOperands(CC1);
5533 // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
5534 // (or (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
5535 if (LL == RL && LR == RR) {
5536 ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, OpVT)
5537 : ISD::getSetCCOrOperation(CC0, CC1, OpVT);
5538 if (NewCC != ISD::SETCC_INVALID &&
5539 (!LegalOperations ||
5540 (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
5541 TLI.isOperationLegal(ISD::SETCC, OpVT))))
5542 return DAG.getSetCC(DL, VT, LL, LR, NewCC);
5548 /// This contains all DAGCombine rules which reduce two values combined by
5549 /// an And operation to a single value. This makes them reusable in the context
5550 /// of visitSELECT(). Rules involving constants are not included as
5551 /// visitSELECT() already handles those cases.
// AND-like folds shared between visitAND and visitSELECT: fold undef
// operands to 0, delegate to foldLogicOfSetCCs, rework (and (add x, c1),
// (lshr y, c2)) so the add immediate is legal, and narrow a low-half bit
// extract (and (srl x, K), mask) to the half-width type when profitable.
// NOTE(review): several guards and closing braces are elided where the
// embedded line numbers jump (e.g. 5579->5581, 5604->5606); code kept
// byte-identical.
5552 SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
5553 EVT VT = N1.getValueType();
5556 // fold (and x, undef) -> 0
5557 if (N0.isUndef() || N1.isUndef())
5558 return DAG.getConstant(0, DL, VT);
5560 if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
5563 // TODO: Rewrite this to return a new 'AND' instead of using CombineTo.
5564 if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
5565 VT.getSizeInBits() <= 64 && N0->hasOneUse()) {
5566 if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
5567 if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
5568 // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
5569 // immediate for an add, but it is legal if its top c2 bits are set,
5570 // transform the ADD so the immediate doesn't need to be materialized
5572 APInt ADDC = ADDI->getAPIntValue();
5573 APInt SRLC = SRLI->getAPIntValue();
5574 if (ADDC.getMinSignedBits() <= 64 &&
5575 SRLC.ult(VT.getSizeInBits()) &&
5576 !TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
5577 APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
5578 SRLC.getZExtValue());
// Only mutate the immediate if the masked-off high bits are provably
// irrelevant to the AND's result.
5579 if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
5581 if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
5584 DAG.getNode(ISD::ADD, DL0, VT,
5585 N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
5586 CombineTo(N0.getNode(), NewAdd);
5587 // Return N so it doesn't get rechecked!
5588 return SDValue(N, 0);
5596 // Reduce bit extract of low half of an integer to the narrower type.
5597 // (and (srl i64:x, K), KMask) ->
5598 // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
5599 if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
5600 if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
5601 if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
5602 unsigned Size = VT.getSizeInBits();
5603 const APInt &AndMask = CAnd->getAPIntValue();
5604 unsigned ShiftBits = CShift->getZExtValue();
5606 // Bail out, this node will probably disappear anyway.
5610 unsigned MaskBits = AndMask.countTrailingOnes();
5611 EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
5613 if (AndMask.isMask() &&
5614 // Required bits must not span the two halves of the integer and
5615 // must fit in the half size type.
5616 (ShiftBits + MaskBits <= Size / 2) &&
5617 TLI.isNarrowingProfitable(VT, HalfVT) &&
5618 TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
5619 TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
5620 TLI.isTruncateFree(VT, HalfVT) &&
5621 TLI.isZExtFree(HalfVT, VT)) {
5622 // The isNarrowingProfitable is to avoid regressions on PPC and
5623 // AArch64 which match a few 64-bit bit insert / bit extract patterns
5624 // on downstream users of this. Those patterns could probably be
5625 // extended to handle extensions mixed in.
5628 assert(MaskBits <= Size);
5630 // Extracting the highest bit of the low half.
5631 EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
5632 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
5635 SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
5636 SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
5637 SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
5638 SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
5639 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
// Decide whether (and (load x), AndC) can be implemented as a zero-extending
// load. AndC must be a low-bit mask; on success ExtVT is set to the narrow
// integer type covering exactly the preserved bits.
5648 bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
5649 EVT LoadResultTy, EVT &ExtVT) {
// The AND constant must be a contiguous run of low set bits (0..01..1);
// anything else cannot be expressed as a zext of a narrower load.
5650 if (!AndC->getAPIntValue().isMask())
// Number of low bits the mask keeps; this defines the narrow type width.
5653 unsigned ActiveBits = AndC->getAPIntValue().countTrailingOnes();
5655 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
5656 EVT LoadedVT = LoadN->getMemoryVT();
// If the mask exactly covers the loaded type, no narrowing is needed -- a
// same-width ZEXTLOAD suffices, subject only to legality after legalization.
5658 if (ExtVT == LoadedVT &&
5659 (!LegalOperations ||
5660 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
5661 // ZEXTLOAD will match without needing to change the size of the value being
// Beyond this point we would actually shrink the load, which is only valid
// for simple (non-volatile, non-atomic) loads.
5666 // Do not change the width of a volatile or atomic loads.
5667 if (!LoadN->isSimple())
5670 // Do not generate loads of non-round integer types since these can
5671 // be expensive (and would be wrong if the type is not byte sized).
// The narrow type must be strictly smaller than what was loaded and must be
// a round (power-of-two byte sized) type.
5672 if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
// After legalization the narrow ZEXTLOAD itself must be legal for the target.
5675 if (LegalOperations &&
5676 !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
// Finally give the target a chance to veto the width reduction.
5679 if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
// Check whether it is legal to narrow the memory access of LDST (a load or a
// store) to MemVT, where the narrow value sits ShAmt bits into the original
// access. Shared legality gate for the load/store width-reduction combines.
5685 bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
5686 ISD::LoadExtType ExtType, EVT &MemVT,
5690 // Only allow byte offsets.
5694 // Do not generate loads of non-round integer types since these can
5695 // be expensive (and would be wrong if the type is not byte sized).
5696 if (!MemVT.isRound())
5699 // Don't change the width of a volatile or atomic loads.
5700 if (!LDST->isSimple())
5703 EVT LdStMemVT = LDST->getMemoryVT();
5705 // Bail out when changing the scalable property, since we can't be sure that
5706 // we're actually narrowing here.
5707 if (LdStMemVT.isScalableVector() != MemVT.isScalableVector())
5710 // Verify that we are actually reducing a load width here.
5711 if (LdStMemVT.bitsLT(MemVT))
5714 // Ensure that this isn't going to produce an unsupported memory access.
// The offset is a whole number of bytes, so the narrow access's alignment is
// the common alignment of the original access and that byte offset.
5716 assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
5717 const unsigned ByteShAmt = ShAmt / 8;
5718 const Align LDSTAlign = LDST->getAlign();
5719 const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
// Ask the target whether MemVT at this alignment/address space is allowed.
5720 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
5721 LDST->getAddressSpace(), NarrowAlign,
5722 LDST->getMemOperand()->getFlags()))
5726 // It's not possible to generate a constant of extended or untyped type.
5727 EVT PtrType = LDST->getBasePtr().getValueType();
5728 if (PtrType == MVT::Untyped || PtrType.isExtended())
// Load-specific checks.
5731 if (isa<LoadSDNode>(LDST)) {
5732 LoadSDNode *Load = cast<LoadSDNode>(LDST);
5733 // Don't transform one with multiple uses, this would require adding a new
5735 if (!SDValue(Load, 0).hasOneUse())
// The extending load of MemVT must be legal once we are past legalization.
5738 if (LegalOperations &&
5739 !TLI.isLoadExtLegal(ExtType, Load->getValueType(0), MemVT))
5742 // For the transform to be legal, the load must produce only two values
5743 // (the value loaded and the chain). Don't transform a pre-increment
5744 // load, for example, which produces an extra value. Otherwise the
5745 // transformation is not equivalent, and the downstream logic to replace
5746 // uses gets things wrong.
5747 if (Load->getNumValues() > 2)
5750 // If the load that we're shrinking is an extload and we're not just
5751 // discarding the extension we can't simply shrink the load. Bail.
5752 // TODO: It would be possible to merge the extensions in some cases.
5753 if (Load->getExtensionType() != ISD::NON_EXTLOAD &&
5754 Load->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
// Target veto point for load shrinking.
5757 if (!TLI.shouldReduceLoadWidth(Load, ExtType, MemVT))
// Store-specific checks (the only other LSBaseSDNode kind).
5760 assert(isa<StoreSDNode>(LDST) && "It is not a Load nor a Store SDNode");
5761 StoreSDNode *Store = cast<StoreSDNode>(LDST);
5762 // Can't write outside the original store
5763 if (Store->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits() + ShAmt)
// The truncating store of MemVT must be legal once we are past legalization.
5766 if (LegalOperations &&
5767 !TLI.isTruncStoreLegal(Store->getValue().getValueType(), MemVT))
// Recursively walk the operands of N (an expression tree under an AND with a
// constant mask) collecting loads that can be narrowed to the masked width.
// Records constants that will need re-masking in NodesWithConsts, and allows
// at most one other node (returned via NodeToMask) to be masked explicitly.
5773 bool DAGCombiner::SearchForAndLoads(SDNode *N,
5774 SmallVectorImpl<LoadSDNode*> &Loads,
5775 SmallPtrSetImpl<SDNode*> &NodesWithConsts,
5776 ConstantSDNode *Mask,
5777 SDNode *&NodeToMask) {
5778 // Recursively search for the operands, looking for loads which can be
// Vector operands are not handled by this scalar-load narrowing search.
5780 for (SDValue Op : N->op_values()) {
5781 if (Op.getValueType().isVector())
5784 // Some constants may need fixing up later if they are too large.
// For OR/XOR, a constant with bits outside the mask would change the result
// once the mask is pushed down, so remember this node for constant narrowing.
5785 if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
5786 if ((N->getOpcode() == ISD::OR || N->getOpcode() == ISD::XOR) &&
5787 (Mask->getAPIntValue() & C->getAPIntValue()) != C->getAPIntValue())
5788 NodesWithConsts.insert(N);
// Every interior node must have a single use or the rewrite would affect
// other users of the subtree.
5792 if (!Op.hasOneUse())
5795 switch(Op.getOpcode()) {
5797 auto *Load = cast<LoadSDNode>(Op);
// A load qualifies if the mask admits a zextload and the narrowing is legal.
5799 if (isAndLoadExtLoad(Mask, Load, Load->getValueType(0), ExtVT) &&
5800 isLegalNarrowLdSt(Load, ISD::ZEXTLOAD, ExtVT)) {
5802 // ZEXTLOAD is already small enough.
5803 if (Load->getExtensionType() == ISD::ZEXTLOAD &&
5804 ExtVT.bitsGE(Load->getMemoryVT()))
5807 // Use LE to convert equal sized loads to zext.
5808 if (ExtVT.bitsLE(Load->getMemoryVT()))
5809 Loads.push_back(Load)
5815 case ISD::ZERO_EXTEND:
5816 case ISD::AssertZext: {
// Zero-extending nodes already guarantee the high bits are zero; they are
// acceptable when the mask is at least as wide as the pre-extension type.
5817 unsigned ActiveBits = Mask->getAPIntValue().countTrailingOnes();
5818 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
5819 EVT VT = Op.getOpcode() == ISD::AssertZext ?
5820 cast<VTSDNode>(Op.getOperand(1))->getVT() :
5821 Op.getOperand(0).getValueType();
5823 // We can accept extending nodes if the mask is wider or an equal
5824 // width to the original type.
5825 if (ExtVT.bitsGE(VT))
// Recurse into other (logic) nodes.
5832 if (!SearchForAndLoads(Op.getNode(), Loads, NodesWithConsts, Mask,
5838 // Allow one node which will masked along with any loads found.
5842 // Also ensure that the node to be masked only produces one data result.
5843 NodeToMask = Op.getNode();
5844 if (NodeToMask->getNumValues() > 1) {
// Multi-result nodes are only acceptable if exactly one result carries data
// (the rest being Glue/Other); otherwise masking it is ambiguous.
5845 bool HasValue = false;
5846 for (unsigned i = 0, e = NodeToMask->getNumValues(); i < e; ++i) {
5847 MVT VT = SDValue(NodeToMask, i).getSimpleValueType();
5848 if (VT != MVT::Glue && VT != MVT::Other) {
5850 NodeToMask = nullptr;
5856 assert(HasValue && "Node to be masked has no data result?");
// Given an AND of an expression tree with a constant low-bit mask, try to push
// the mask back up to the leaves: narrow the leaf loads, re-mask interior
// constants, and then drop the AND entirely. Returns true if the DAG changed.
5862 bool DAGCombiner::BackwardsPropagateMask(SDNode *N) {
5863 auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(1));
// Only contiguous low-bit masks can be absorbed into zero-extending loads.
5867 if (!Mask->getAPIntValue().isMask())
5870 // No need to do anything if the and directly uses a load.
// (That simple case is handled by reduceLoadWidth elsewhere.)
5871 if (isa<LoadSDNode>(N->getOperand(0)))
5874 SmallVector<LoadSDNode*, 8> Loads;
5875 SmallPtrSet<SDNode*, 2> NodesWithConsts;
5876 SDNode *FixupNode = nullptr;
5877 if (SearchForAndLoads(N, Loads, NodesWithConsts, Mask, FixupNode)) {
// Without any loads to narrow the transform buys nothing.
5878 if (Loads.size() == 0)
5881 LLVM_DEBUG(dbgs() << "Backwards propagate AND: "; N->dump());
5882 SDValue MaskOp = N->getOperand(1);
5884 // If it exists, fixup the single node we allow in the tree that needs
5887 LLVM_DEBUG(dbgs() << "First, need to fix up: "; FixupNode->dump());
5888 SDValue And = DAG.getNode(ISD::AND, SDLoc(FixupNode),
5889 FixupNode->getValueType(0),
5890 SDValue(FixupNode, 0), MaskOp);
5891 DAG.ReplaceAllUsesOfValueWith(SDValue(FixupNode, 0), And);
// RAUW also rewrote the new AND's own operand to itself; restore it so the
// AND still masks the original fixup node rather than itself.
5892 if (And.getOpcode() == ISD ::AND)
5893 DAG.UpdateNodeOperands(And.getNode(), SDValue(FixupNode, 0), MaskOp);
5896 // Narrow any constants that need it.
5897 for (auto *LogicN : NodesWithConsts) {
5898 SDValue Op0 = LogicN->getOperand(0);
5899 SDValue Op1 = LogicN->getOperand(1);
// Canonicalize so the constant ends up in Op1 before masking it.
5901 if (isa<ConstantSDNode>(Op0))
5902 std::swap(Op0, Op1);
5904 SDValue And = DAG.getNode(ISD::AND, SDLoc(Op1), Op1.getValueType(),
5907 DAG.UpdateNodeOperands(LogicN, Op0, And);
5910 // Create narrow loads.
5911 for (auto *Load : Loads) {
5912 LLVM_DEBUG(dbgs() << "Propagate AND back to: "; Load->dump());
5913 SDValue And = DAG.getNode(ISD::AND, SDLoc(Load), Load->getValueType(0),
5914 SDValue(Load, 0), MaskOp);
5915 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), And);
// Same self-reference fixup as above, then shrink the load through the
// freshly inserted AND.
5916 if (And.getOpcode() == ISD ::AND)
5918 DAG.UpdateNodeOperands(And.getNode(), SDValue(Load, 0), MaskOp), 0);
5919 SDValue NewLoad = reduceLoadWidth(And.getNode());
5921 "Shouldn't be masking the load if it can't be narrowed");
5922 CombineTo(Load, NewLoad, NewLoad.getValue(1));
// All leaves now produce correctly masked values, so the outer AND is
// redundant and N can be replaced by its unmasked operand.
5924 DAG.ReplaceAllUsesWith(N, N->getOperand(0).getNode());
5931 // x & (-1 'logical shift' y)
5933 // (x 'opposite logical shift' y) 'logical shift' y
5934 // if it is better for performance.
// Rewrites a mask formed by shifting all-ones into a pair of opposing shifts
// that clear the same bits, when the target says a shift pair is preferable.
5935 SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
5936 assert(N->getOpcode() == ISD::AND);
5938 SDValue N0 = N->getOperand(0);
5939 SDValue N1 = N->getOperand(1);
5941 // Do we actually prefer shifts over mask?
5942 if (!TLI.shouldFoldMaskToVariableShiftPair(N0))
5945 // Try to match (-1 '[outer] logical shift' y)
5946 unsigned OuterShift;
5947 unsigned InnerShift; // The opposite direction to the OuterShift.
5948 SDValue Y; // Shift amount.
// Matches M = (shl/srl -1, Y); records the outer shift opcode, its logical
// opposite, and the shift amount Y.
5949 auto matchMask = [&OuterShift, &InnerShift, &Y](SDValue M) -> bool {
5952 OuterShift = M->getOpcode();
5953 if (OuterShift == ISD::SHL)
5954 InnerShift = ISD::SRL;
5955 else if (OuterShift == ISD::SRL)
5956 InnerShift = ISD::SHL;
// The shifted value must be all-ones for the result to be a pure mask.
5959 if (!isAllOnesConstant(M->getOperand(0)))
5961 Y = M->getOperand(1);
// The mask may be on either side of the AND.
5968 else if (matchMask(N0))
5974 EVT VT = N->getValueType(0);
5976 // tmp = x 'opposite logical shift' y
5977 SDValue T0 = DAG.getNode(InnerShift, DL, VT, X, Y);
5978 // ret = tmp 'logical shift' y
5979 SDValue T1 = DAG.getNode(OuterShift, DL, VT, T0, Y);
5984 /// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
5985 /// For a target with a bit test, this is expected to become test + set and save
5986 /// at least 1 instruction.
5987 static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
5988 assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
5990 // This is probably not worthwhile without a supported type.
5991 EVT VT = And->getValueType(0);
5992 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5993 if (!TLI.isTypeLegal(VT))
5996 // Look through an optional extension.
5997 SDValue And0 = And->getOperand(0), And1 = And->getOperand(1);
5998 if (And0.getOpcode() == ISD::ANY_EXTEND && And0.hasOneUse())
5999 And0 = And0.getOperand(0);
// The pattern only fires when masking down to a single low bit.
6000 if (!isOneConstant(And1) || !And0.hasOneUse())
6005 // Attempt to find a 'not' op.
6006 // TODO: Should we favor test+set even without the 'not' op?
6007 bool FoundNot = false;
6008 if (isBitwiseNot(Src)) {
6010 Src = Src.getOperand(0);
6012 // Look though an optional truncation. The source operand may not be the
6013 // same type as the original 'and', but that is ok because we are masking
6014 // off everything but the low bit.
6015 if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse())
6016 Src = Src.getOperand(0);
6019 // Match a shift-right by constant.
6020 if (Src.getOpcode() != ISD::SRL || !Src.hasOneUse())
6023 // We might have looked through casts that make this transform invalid.
6024 // TODO: If the source type is wider than the result type, do the mask and
6025 // compare in the source type.
// The shift amount must be a constant within the result's bit width, or the
// 1<<C mask built below would be meaningless.
6026 unsigned VTBitWidth = VT.getScalarSizeInBits();
6027 SDValue ShiftAmt = Src.getOperand(1);
6028 auto *ShiftAmtC = dyn_cast<ConstantSDNode>(ShiftAmt);
6029 if (!ShiftAmtC || !ShiftAmtC->getAPIntValue().ult(VTBitWidth))
6032 // Set source to shift source.
6033 Src = Src.getOperand(0);
6035 // Try again to find a 'not' op.
6036 // TODO: Should we favor test+set even with two 'not' ops?
// The 'not' may sit either above or below the shift; either placement works.
6038 if (!isBitwiseNot(Src))
6040 Src = Src.getOperand(0);
// Only profitable on targets with a real bit-test instruction.
6043 if (!TLI.hasBitTest(Src, ShiftAmt))
6046 // Turn this into a bit-test pattern using mask op + setcc:
6047 // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
6048 // and (srl (not X), C)), 1 --> (and X, 1<<C) == 0
6050 SDValue X = DAG.getZExtOrTrunc(Src, DL, VT);
6051 EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
6052 SDValue Mask = DAG.getConstant(
6053 APInt::getOneBitSet(VTBitWidth, ShiftAmtC->getZExtValue()), DL, VT);
6054 SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
6055 SDValue Zero = DAG.getConstant(0, DL, VT);
6056 SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
// The setcc result type may differ from VT; normalize back to VT.
6057 return DAG.getZExtOrTrunc(Setcc, DL, VT);
6060 /// For targets that support usubsat, match a bit-hack form of that operation
6061 /// that ends in 'and' and convert it.
6062 static SDValue foldAndToUsubsat(SDNode *N, SelectionDAG &DAG) {
6063 SDValue N0 = N->getOperand(0);
6064 SDValue N1 = N->getOperand(1);
6065 EVT VT = N1.getValueType();
6067 // Canonicalize SRA as operand 1.
6068 if (N0.getOpcode() == ISD::SRA)
6071 // xor/add with SMIN (signmask) are logically equivalent.
6072 if (N0.getOpcode() != ISD::XOR && N0.getOpcode() != ISD::ADD)
// Both halves of the AND must be one-use and derived from the same value X:
// (X ^/+ signmask) & (X s>> bw-1).
6075 if (N1.getOpcode() != ISD::SRA || !N0.hasOneUse() || !N1.hasOneUse() ||
6076 N0.getOperand(0) != N1.getOperand(0))
// The xor/add constant must be exactly the sign mask, and the arithmetic
// shift must replicate the sign bit across the full width (shift by bw-1).
6079 unsigned BitWidth = VT.getScalarSizeInBits();
6080 ConstantSDNode *XorC = isConstOrConstSplat(N0.getOperand(1), true);
6081 ConstantSDNode *SraC = isConstOrConstSplat(N1.getOperand(1), true);
6082 if (!XorC || !XorC->getAPIntValue().isSignMask() ||
6083 !SraC || SraC->getAPIntValue() != BitWidth - 1)
6086 // (i8 X ^ 128) & (i8 X s>> 7) --> usubsat X, 128
6087 // (i8 X + 128) & (i8 X s>> 7) --> usubsat X, 128
6089 SDValue SignMask = DAG.getConstant(XorC->getAPIntValue(), DL, VT);
6090 return DAG.getNode(ISD::USUBSAT, DL, VT, N0.getOperand(0), SignMask);
6093 /// Given a bitwise logic operation N with a matching bitwise logic operand,
6094 /// fold a pattern where 2 of the source operands are identically shifted
6095 /// values. For example:
6096 /// ((X0 << Y) | Z) | (X1 << Y) --> ((X0 | X1) << Y) | Z
6097 static SDValue foldLogicOfShifts(SDNode *N, SDValue LogicOp, SDValue ShiftOp,
6098 SelectionDAG &DAG) {
6099 unsigned LogicOpcode = N->getOpcode();
6100 assert((LogicOpcode == ISD::AND || LogicOpcode == ISD::OR ||
6101 LogicOpcode == ISD::XOR)
6102 && "Expected bitwise logic operation");
// Both intermediate nodes must be single-use or the fold adds nodes without
// removing any.
6104 if (!LogicOp.hasOneUse() || !ShiftOp.hasOneUse())
6107 // Match another bitwise logic op and a shift.
// The inner logic op must match N's opcode, and ShiftOp must be one of the
// three shift kinds; the same fold is valid for shl, srl and sra.
6108 unsigned ShiftOpcode = ShiftOp.getOpcode();
6109 if (LogicOp.getOpcode() != LogicOpcode ||
6110 !(ShiftOpcode == ISD::SHL || ShiftOpcode == ISD::SRL ||
6111 ShiftOpcode == ISD::SRA))
6114 // Match another shift op inside the first logic operand. Handle both commuted
6116 // LOGIC (LOGIC (SH X0, Y), Z), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
6117 // LOGIC (LOGIC Z, (SH X0, Y)), (SH X1, Y) --> LOGIC (SH (LOGIC X0, X1), Y), Z
6118 SDValue X1 = ShiftOp.getOperand(0);
6119 SDValue Y = ShiftOp.getOperand(1);
// Both shifts must use the exact same opcode and amount Y for the rewrite to
// be an identity.
6121 if (LogicOp.getOperand(0).getOpcode() == ShiftOpcode &&
6122 LogicOp.getOperand(0).getOperand(1) == Y) {
6123 X0 = LogicOp.getOperand(0).getOperand(0);
6124 Z = LogicOp.getOperand(1);
6125 } else if (LogicOp.getOperand(1).getOpcode() == ShiftOpcode &&
6126 LogicOp.getOperand(1).getOperand(1) == Y) {
6127 X0 = LogicOp.getOperand(1).getOperand(0);
6128 Z = LogicOp.getOperand(0);
// Build LOGIC(SH(LOGIC(X0, X1), Y), Z): one shift instead of two.
6133 EVT VT = N->getValueType(0);
6135 SDValue LogicX = DAG.getNode(LogicOpcode, DL, VT, X0, X1);
6136 SDValue NewShift = DAG.getNode(ShiftOpcode, DL, VT, LogicX, Y);
6137 return DAG.getNode(LogicOpcode, DL, VT, NewShift, Z);
// Main combine entry point for ISD::AND nodes. Tries, in order: constant
// folding and canonicalization, vector-specific folds, load narrowing,
// masked-load/gather zero-extension, backwards mask propagation, shift/logic
// folds, bswap matching, and bit-test/usubsat pattern matching. Returns the
// replacement value, or SDValue() (or SDValue(N, 0) after in-place CombineTo)
// per the usual DAG-combine protocol.
6140 SDValue DAGCombiner::visitAND(SDNode *N) {
6141 SDValue N0 = N->getOperand(0);
6142 SDValue N1 = N->getOperand(1);
6143 EVT VT = N1.getValueType();
6149 // fold (and c1, c2) -> c1&c2
6150 if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
6153 // canonicalize constant to RHS
6154 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
6155 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
6156 return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
// Vector-only folds.
6159 if (VT.isVector()) {
6160 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
6163 // fold (and x, 0) -> 0, vector edition
6164 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
6165 // do not return N1, because undef node may exist in N1
6166 return DAG.getConstant(APInt::getZero(N1.getScalarValueSizeInBits()),
6167 SDLoc(N), N1.getValueType());
6169 // fold (and x, -1) -> x, vector edition
6170 if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
6173 // fold (and (masked_load) (splat_vec (x, ...))) to zext_masked_load
6174 auto *MLoad = dyn_cast<MaskedLoadSDNode>(N0);
6175 ConstantSDNode *Splat = isConstOrConstSplat(N1, true, true);
6176 if (MLoad && MLoad->getExtensionType() == ISD::EXTLOAD && N0.hasOneUse() &&
6177 Splat && N1.hasOneUse()) {
6178 EVT LoadVT = MLoad->getMemoryVT();
6180 if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT, LoadVT)) {
6181 // For this AND to be a zero extension of the masked load the elements
6182 // of the BuildVec must mask the bottom bits of the extended element
6184 uint64_t ElementSize =
6185 LoadVT.getVectorElementType().getScalarSizeInBits();
6186 if (Splat->getAPIntValue().isMask(ElementSize)) {
6187 return DAG.getMaskedLoad(
6188 ExtVT, SDLoc(N), MLoad->getChain(), MLoad->getBasePtr(),
6189 MLoad->getOffset(), MLoad->getMask(), MLoad->getPassThru(),
6190 LoadVT, MLoad->getMemOperand(), MLoad->getAddressingMode(),
6191 ISD::ZEXTLOAD, MLoad->isExpandingLoad());
6197 // fold (and x, -1) -> x
6198 if (isAllOnesConstant(N1))
6201 // if (and x, c) is known to be zero, return 0
6202 unsigned BitWidth = VT.getScalarSizeInBits();
6203 ConstantSDNode *N1C = isConstOrConstSplat(N1);
6204 if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(BitWidth)))
6205 return DAG.getConstant(0, SDLoc(N), VT);
6207 if (SDValue NewSel = foldBinOpIntoSelect(N))
// Reassociate (and (and x, c1), c2) style chains.
6211 if (SDValue RAND = reassociateOps(ISD::AND, SDLoc(N), N0, N1, N->getFlags()))
6214 // Try to convert a constant mask AND into a shuffle clear mask.
6216 if (SDValue Shuffle = XformToShuffleWithZero(N))
6219 if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
6222 // fold (and (or x, C), D) -> D if (C & D) == D
6223 auto MatchSubset = [](ConstantSDNode *LHS, ConstantSDNode *RHS) {
6224 return RHS->getAPIntValue().isSubsetOf(LHS->getAPIntValue());
6226 if (N0.getOpcode() == ISD::OR &&
6227 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchSubset))
6229 // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
6230 if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
6231 SDValue N0Op0 = N0.getOperand(0);
// Mask holds the bits the AND clears, truncated to the pre-extension width;
// if V already has those bits zero, the any_extend is effectively a zext.
6232 APInt Mask = ~N1C->getAPIntValue();
6233 Mask = Mask.trunc(N0Op0.getScalarValueSizeInBits());
6234 if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
6235 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
6236 N0.getValueType(), N0Op0);
6238 // Replace uses of the AND with uses of the Zero extend node.
6241 // We actually want to replace all uses of the any_extend with the
6242 // zero_extend, to avoid duplicating things. This will later cause this
6243 // AND to be folded.
6244 CombineTo(N0.getNode(), Zext);
6245 return SDValue(N, 0); // Return N so it doesn't get rechecked!
6249 // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
6250 // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
6251 // already be zero by virtue of the width of the base type of the load.
6253 // the 'X' node here can either be nothing or an extract_vector_elt to catch
6255 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6256 N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
6257 N0.getOperand(0).getOpcode() == ISD::LOAD &&
6258 N0.getOperand(0).getResNo() == 0) ||
6259 (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
6260 LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
6261 N0 : N0.getOperand(0) );
6263 // Get the constant (if applicable) the zero'th operand is being ANDed with.
6264 // This can be a pure constant or a vector splat, in which case we treat the
6265 // vector as a scalar and use the splat value.
6266 APInt Constant = APInt::getZero(1);
6267 if (const ConstantSDNode *C = isConstOrConstSplat(
6268 N1, /*AllowUndef=*/false, /*AllowTruncation=*/true)) {
6269 Constant = C->getAPIntValue();
6270 } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
6271 APInt SplatValue, SplatUndef;
6272 unsigned SplatBitSize;
6274 bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
6275 SplatBitSize, HasAnyUndefs);
6277 // Undef bits can contribute to a possible optimisation if set, so
6279 SplatValue |= SplatUndef;
6281 // The splat value may be something like "0x00FFFFFF", which means 0 for
6282 // the first vector value and FF for the rest, repeating. We need a mask
6283 // that will apply equally to all members of the vector, so AND all the
6284 // lanes of the constant together.
6285 unsigned EltBitWidth = Vector->getValueType(0).getScalarSizeInBits();
6287 // If the splat value has been compressed to a bitlength lower
6288 // than the size of the vector lane, we need to re-expand it to
6290 if (EltBitWidth > SplatBitSize)
6291 for (SplatValue = SplatValue.zextOrTrunc(EltBitWidth);
6292 SplatBitSize < EltBitWidth; SplatBitSize = SplatBitSize * 2)
6293 SplatValue |= SplatValue.shl(SplatBitSize);
6295 // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
6296 // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
6297 if ((SplatBitSize % EltBitWidth) == 0) {
6298 Constant = APInt::getAllOnes(EltBitWidth);
6299 for (unsigned i = 0, n = (SplatBitSize / EltBitWidth); i < n; ++i)
6300 Constant &= SplatValue.extractBits(EltBitWidth, i * EltBitWidth);
6305 // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
6306 // actually legal and isn't going to get expanded, else this is a false
6308 bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
6309 Load->getValueType(0),
6310 Load->getMemoryVT());
6312 // Resize the constant to the same size as the original memory access before
6313 // extension. If it is still the AllOnesValue then this AND is completely
6315 Constant = Constant.zextOrTrunc(Load->getMemoryVT().getScalarSizeInBits());
// B records whether the AND is redundant for this extension kind.
6318 switch (Load->getExtensionType()) {
6319 default: B = false; break;
6320 case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
6322 case ISD::NON_EXTLOAD: B = true; break;
6325 if (B && Constant.isAllOnes()) {
6326 // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
6327 // preserve semantics once we get rid of the AND.
6328 SDValue NewLoad(Load, 0);
6330 // Fold the AND away. NewLoad may get replaced immediately.
6331 CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
6333 if (Load->getExtensionType() == ISD::EXTLOAD) {
6334 NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
6335 Load->getValueType(0), SDLoc(Load),
6336 Load->getChain(), Load->getBasePtr(),
6337 Load->getOffset(), Load->getMemoryVT(),
6338 Load->getMemOperand());
6339 // Replace uses of the EXTLOAD with the new ZEXTLOAD.
6340 if (Load->getNumValues() == 3) {
6341 // PRE/POST_INC loads have 3 values.
6342 SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
6343 NewLoad.getValue(2) };
6344 CombineTo(Load, To, 3, true);
6346 CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
6350 return SDValue(N, 0); // Return N so it doesn't get rechecked!
// (and (extract_subvector (ext v)) low-mask) -> extract_subvector of a zext,
// when the mask exactly covers the pre-extension scalar width.
6354 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && N0.hasOneUse() && N1C &&
6355 ISD::isExtOpcode(N0.getOperand(0).getOpcode())) {
6356 SDValue Ext = N0.getOperand(0);
6357 EVT ExtVT = Ext->getValueType(0);
6358 SDValue Extendee = Ext->getOperand(0);
6360 unsigned ScalarWidth = Extendee.getValueType().getScalarSizeInBits();
6361 if (N1C->getAPIntValue().isMask(ScalarWidth)) {
6362 // (and (extract_subvector (zext|anyext|sext v) _) iN_mask)
6363 // => (extract_subvector (iN_zeroext v))
6364 SDValue ZeroExtExtendee =
6365 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), ExtVT, Extendee);
6367 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, ZeroExtExtendee,
6372 // fold (and (masked_gather x)) -> (zext_masked_gather x)
6373 if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
6374 EVT MemVT = GN0->getMemoryVT();
6375 EVT ScalarVT = MemVT.getScalarType();
6377 if (SDValue(GN0, 0).hasOneUse() &&
6378 isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
6379 TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
6380 SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
6381 GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
6383 SDValue ZExtLoad = DAG.getMaskedGather(
6384 DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
6385 GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
6387 CombineTo(N, ZExtLoad);
6388 AddToWorklist(ZExtLoad.getNode());
6389 // Avoid recheck of N.
6390 return SDValue(N, 0);
6394 // fold (and (load x), 255) -> (zextload x, i8)
6395 // fold (and (extload x, i16), 255) -> (zextload x, i8)
6396 if (N1C && N0.getOpcode() == ISD::LOAD && !VT.isVector())
6397 if (SDValue Res = reduceLoadWidth(N))
6401 // Attempt to propagate the AND back up to the leaves which, if they're
6402 // loads, can be combined to narrow loads and the AND node can be removed.
6403 // Perform after legalization so that extend nodes will already be
6404 // combined into the loads.
6405 if (BackwardsPropagateMask(N))
6406 return SDValue(N, 0);
6409 if (SDValue Combined = visitANDLike(N0, N1, N))
6412 // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
6413 if (N0.getOpcode() == N1.getOpcode())
6414 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
// foldLogicOfShifts is tried with both operand orders.
6417 if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
6419 if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
6422 // Masking the negated extension of a boolean is just the zero-extended
6424 // and (sub 0, zext(bool X)), 1 --> zext(bool X)
6425 // and (sub 0, sext(bool X)), 1 --> zext(bool X)
6427 // Note: the SimplifyDemandedBits fold below can make an information-losing
6428 // transform, and then we have no way to find this better fold.
6429 if (N1C && N1C->isOne() && N0.getOpcode() == ISD::SUB) {
6430 if (isNullOrNullSplat(N0.getOperand(0))) {
6431 SDValue SubRHS = N0.getOperand(1);
6432 if (SubRHS.getOpcode() == ISD::ZERO_EXTEND &&
6433 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
6435 if (SubRHS.getOpcode() == ISD::SIGN_EXTEND &&
6436 SubRHS.getOperand(0).getScalarValueSizeInBits() == 1)
6437 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, SubRHS.getOperand(0));
6441 // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
6442 // fold (and (sra)) -> (and (srl)) when possible.
6443 if (SimplifyDemandedBits(SDValue(N, 0)))
6444 return SDValue(N, 0);
6446 // fold (zext_inreg (extload x)) -> (zextload x)
6447 // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
6448 if (ISD::isUNINDEXEDLoad(N0.getNode()) &&
6449 (ISD::isEXTLoad(N0.getNode()) ||
6450 (ISD::isSEXTLoad(N0.getNode()) && N0.hasOneUse()))) {
6451 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
6452 EVT MemVT = LN0->getMemoryVT();
6453 // If we zero all the possible extended bits, then we can turn this into
6454 // a zextload if we are running before legalize or the operation is legal.
6455 unsigned ExtBitSize = N1.getScalarValueSizeInBits();
6456 unsigned MemBitSize = MemVT.getScalarSizeInBits();
6457 APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
6458 if (DAG.MaskedValueIsZero(N1, ExtBits) &&
6459 ((!LegalOperations && LN0->isSimple()) ||
6460 TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
6462 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
6463 LN0->getBasePtr(), MemVT, LN0->getMemOperand())
6465 CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
6466 return SDValue(N, 0); // Return N so it doesn't get rechecked!
6470 // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
6471 if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
6472 if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
6473 N0.getOperand(1), false))
6477 if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
6480 if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
6483 // Recognize the following pattern:
6485 // AndVT = (and (sign_extend NarrowVT to AndVT) #bitmask)
6487 // where bitmask is a mask that clears the upper bits of AndVT. The
6488 // number of bits in bitmask must be a power of two.
6489 auto IsAndZeroExtMask = [](SDValue LHS, SDValue RHS) {
6490 if (LHS->getOpcode() != ISD::SIGN_EXTEND)
6493 auto *C = dyn_cast<ConstantSDNode>(RHS);
6497 if (!C->getAPIntValue().isMask(
6498 LHS.getOperand(0).getValueType().getFixedSizeInBits()))
6504 // Replace (and (sign_extend ...) #bitmask) with (zero_extend ...).
6505 if (IsAndZeroExtMask(N0, N1))
6506 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0.getOperand(0));
6508 if (hasOperation(ISD::USUBSAT, VT))
6509 if (SDValue V = foldAndToUsubsat(N, DAG))
6515 /// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
6516 SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
6517 bool DemandHighBits) {
// Only attempted after operation legalization, on scalar i16/i32/i64 where
// the target supports BSWAP.
6518 if (!LegalOperations)
6521 EVT VT = N->getValueType(0);
6522 if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
6524 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
6527 // Recognize (and (shl a, 8), 0xff00), (and (srl a, 8), 0xff)
// LookPassAnd0/1 record that a masking AND was peeled off N0/N1, which
// matters later when proving the high bits are zero.
6528 bool LookPassAnd0 = false;
6529 bool LookPassAnd1 = false;
6530 if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
6532 if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
6534 if (N0.getOpcode() == ISD::AND) {
6535 if (!N0->hasOneUse())
6537 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6538 // Also handle 0xffff since the LHS is guaranteed to have zeros there.
6539 // This is needed for X86.
6540 if (!N01C || (N01C->getZExtValue() != 0xFF00 &&
6541 N01C->getZExtValue() != 0xFFFF))
6543 N0 = N0.getOperand(0);
6544 LookPassAnd0 = true;
6547 if (N1.getOpcode() == ISD::AND) {
6548 if (!N1->hasOneUse())
6550 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
6551 if (!N11C || N11C->getZExtValue() != 0xFF)
6553 N1 = N1.getOperand(0);
6554 LookPassAnd1 = true;
// Canonicalize so N0 is the SHL and N1 the SRL; both shift amounts must be 8.
6557 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
6559 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
6561 if (!N0->hasOneUse() || !N1->hasOneUse())
6564 ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6565 ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
6568 if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
6571 // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
6572 SDValue N00 = N0->getOperand(0);
6573 if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
6574 if (!N00->hasOneUse())
6576 ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
6577 if (!N001C || N001C->getZExtValue() != 0xFF)
6579 N00 = N00.getOperand(0);
6580 LookPassAnd0 = true;
6583 SDValue N10 = N1->getOperand(0);
6584 if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
6585 if (!N10->hasOneUse())
6587 ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
6588 // Also allow 0xFFFF since the bits will be shifted out. This is needed
6590 if (!N101C || (N101C->getZExtValue() != 0xFF00 &&
6591 N101C->getZExtValue() != 0xFFFF))
6593 N10 = N10.getOperand(0);
6594 LookPassAnd1 = true;
6600 // Make sure everything beyond the low halfword gets set to zero since the SRL
6601 // 16 will clear the top bits.
6602 unsigned OpSizeInBits = VT.getSizeInBits();
6603 if (OpSizeInBits > 16) {
6604 // If the left-shift isn't masked out then the only way this is a bswap is
6605 // if all bits beyond the low 8 are 0. In that case the entire pattern
6606 // reduces to a left shift anyway: leave it for other parts of the combiner.
6607 if (DemandHighBits && !LookPassAnd0)
6610 // However, if the right shift isn't masked out then it might be because
6611 // it's not needed. See if we can spot that too. If the high bits aren't
6612 // demanded, we only need bits 23:16 to be zero. Otherwise, we need all
6613 // upper bits to be zero.
6614 if (!LookPassAnd1) {
6615 unsigned HighBit = DemandHighBits ? OpSizeInBits : 24;
6616 if (!DAG.MaskedValueIsZero(N10,
6617 APInt::getBitsSet(OpSizeInBits, 16, HighBit)))
// Emit bswap of the source; for types wider than 16 bits the swapped
// halfword lands in the top, so shift it back down by width-16.
6622 SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
6623 if (OpSizeInBits > 16) {
6625 Res = DAG.getNode(ISD::SRL, DL, VT, Res,
6626 DAG.getConstant(OpSizeInBits - 16, DL,
6627 getShiftAmountTy(VT)));
6632 /// Return true if the specified node is an element that makes up a 32-bit
6633 /// packed halfword byteswap.
6634 /// ((x & 0x000000ff) << 8) |
6635 /// ((x & 0x0000ff00) >> 8) |
6636 /// ((x & 0x00ff0000) << 8) |
6637 /// ((x & 0xff000000) >> 8)
/// On a successful match, the source node for the matched byte is recorded in
/// Parts[MaskByteOffset] so the caller can check all four bytes come from the
/// same value. Returns false if the slot was already filled by another node.
6638 static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
// Only fold single-use elements; otherwise the original and/shift survive and
// the transform just adds work.
6639 if (!N->hasOneUse())
// The element must be an AND, SHL, or SRL whose operand is also one of those
// three: each byte of the pattern is a (mask, shift-by-8) pair in either order.
6642 unsigned Opc = N.getOpcode();
6643 if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
6646 SDValue N0 = N.getOperand(0);
6647 unsigned Opc0 = N0.getOpcode();
6648 if (Opc0 != ISD::AND && Opc0 != ISD::SHL && Opc0 != ISD::SRL)
// Find the constant byte mask: either the RHS of this AND, or the RHS of the
// inner AND when the outer node is the shift.
6651 ConstantSDNode *N1C = nullptr;
6652 // SHL or SRL: look upstream for AND mask operand
6653 if (Opc == ISD::AND)
6654 N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6655 else if (Opc0 == ISD::AND)
6656 N1C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
// Map the mask constant to the byte index (0 = LSB) it selects.
6660 unsigned MaskByteOffset;
6661 switch (N1C->getZExtValue()) {
6664 case 0xFF: MaskByteOffset = 0; break;
6665 case 0xFF00: MaskByteOffset = 1; break;
6667 // In case demanded bits didn't clear the bits that will be shifted out.
6668 // This is needed for X86.
6669 if (Opc == ISD::SRL || (Opc == ISD::AND && Opc0 == ISD::SHL)) {
6674 case 0xFF0000: MaskByteOffset = 2; break;
6675 case 0xFF000000: MaskByteOffset = 3; break;
// Now verify the shift direction/amount is consistent with the byte index:
// bytes 0 and 2 are shifted left by 8, bytes 1 and 3 are shifted right by 8,
// and the shift may sit either inside or outside the AND.
6678 // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
6679 if (Opc == ISD::AND) {
6680 if (MaskByteOffset == 0 || MaskByteOffset == 2) {
6682 // (x >> 8) & 0xff0000
6683 if (Opc0 != ISD::SRL)
6685 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6686 if (!C || C->getZExtValue() != 8)
6689 // (x << 8) & 0xff00
6690 // (x << 8) & 0xff000000
6691 if (Opc0 != ISD::SHL)
6693 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
6694 if (!C || C->getZExtValue() != 8)
6697 } else if (Opc == ISD::SHL) {
6699 // (x & 0xff0000) << 8
6700 if (MaskByteOffset != 0 && MaskByteOffset != 2)
6702 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6703 if (!C || C->getZExtValue() != 8)
6705 } else { // Opc == ISD::SRL
6706 // (x & 0xff00) >> 8
6707 // (x & 0xff000000) >> 8
6708 if (MaskByteOffset != 1 && MaskByteOffset != 3)
6710 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
6711 if (!C || C->getZExtValue() != 8)
// Each byte slot may only be claimed once across the whole pattern.
6715 if (Parts[MaskByteOffset])
// Record the ultimate source of this byte (the value below mask and shift).
6718 Parts[MaskByteOffset] = N0.getOperand(0).getNode();
6722 // Match 2 elements of a packed halfword bswap.
// Either an OR of two single-byte elements (filling two slots of Parts), or an
// already-formed (srl (bswap X), 16), which supplies both low-half bytes.
6723 static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
6724 if (N.getOpcode() == ISD::OR)
6725 return isBSwapHWordElement(N.getOperand(0), Parts) &&
6726 isBSwapHWordElement(N.getOperand(1), Parts);
// (srl (bswap X), 16): the bswap's low halfword is X's high halfword, so a
// shift of exactly 16 stands in for both byte-0 and byte-1 elements.
6728 if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
6729 ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
6730 if (!C || C->getAPIntValue() != 16)
// Both low-half slots come from the bswap's input.
6732 Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
6739 // Match this pattern:
6740 // (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff))
6741 // And rewrite this to:
6742 // (rotr (bswap A), 16)
// Only fires for i32 (asserted) and only when ROTR is legal or custom, since
// the rewrite always produces a rotate.
6743 static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
6744 SelectionDAG &DAG, SDNode *N, SDValue N0,
6745 SDValue N1, EVT VT, EVT ShiftAmountTy) {
6746 assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
6747 "MatchBSwapHWordOrAndAnd: expecting i32");
6748 if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
6750 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
6752 // TODO: this is too restrictive; lifting this restriction requires more tests
6753 if (!N0->hasOneUse() || !N1->hasOneUse())
// The two AND masks must be exactly the alternating-byte pair; splat constants
// are accepted via isConstOrConstSplat.
6755 ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
6756 ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
6757 if (!Mask0 || !Mask1)
6759 if (Mask0->getAPIntValue() != 0xff00ff00 ||
6760 Mask1->getAPIntValue() != 0x00ff00ff)
// Inside the masks: a left shift on the 0xff00ff00 side, a right shift on the
// 0x00ff00ff side, both by 8, and both of the same source value A.
6762 SDValue Shift0 = N0.getOperand(0);
6763 SDValue Shift1 = N1.getOperand(0);
6764 if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
6766 ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
6767 ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
6768 if (!ShiftAmt0 || !ShiftAmt1)
6770 if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
6772 if (Shift0.getOperand(0) != Shift1.getOperand(0))
// Matched: byte-swap A, then rotate right by 16 to swap within halfwords.
6776 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
6777 SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
6778 return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
6781 /// Match a 32-bit packed halfword bswap. That is
6782 /// ((x & 0x000000ff) << 8) |
6783 /// ((x & 0x0000ff00) >> 8) |
6784 /// ((x & 0x00ff0000) << 8) |
6785 /// ((x & 0xff000000) >> 8)
6786 /// => (rotl (bswap x), 16)
/// N is the OR node being combined; N0/N1 are its operands. Returns the
/// replacement value or an empty SDValue if no match.
6787 SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
6788 if (!LegalOperations)
6791 EVT VT = N->getValueType(0);
// BSWAP itself must be available; every rewrite below produces one.
6794 if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
// First try the compact and-of-shifts form, in both operand orders.
6797 if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
6798 getShiftAmountTy(VT)))
6801 // Try again with commuted operands.
6802 if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
6803 getShiftAmountTy(VT)))
// Otherwise look for the four single-byte elements distributed across the OR
// tree in any of these associations:
6808 // (or (bswaphpair), (bswaphpair))
6809 // (or (or (bswaphpair), (and)), (and))
6810 // (or (or (and), (bswaphpair)), (and))
// Parts[i] collects the source node of byte i; all four must agree.
6811 SDNode *Parts[4] = {};
6813 if (isBSwapHWordPair(N0, Parts)) {
6814 // (or (or (and), (and)), (or (and), (and)))
6815 if (!isBSwapHWordPair(N1, Parts))
6817 } else if (N0.getOpcode() == ISD::OR) {
6818 // (or (or (or (and), (and)), (and)), (and))
6819 if (!isBSwapHWordElement(N1, Parts))
6821 SDValue N00 = N0.getOperand(0);
6822 SDValue N01 = N0.getOperand(1);
6823 if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
6824 !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
6830 // Make sure the parts are all coming from the same node.
6831 if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
6835 SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
6836 SDValue(Parts[0], 0));
6838 // Result of the bswap should be rotated by 16. If it's not legal, then
6839 // do (x << 16) | (x >> 16).
// Note rotl-16 and rotr-16 are equivalent on i32, so either flavor works.
6840 SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
6841 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
6842 return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
6843 if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
6844 return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
6845 return DAG.getNode(ISD::OR, DL, VT,
6846 DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
6847 DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
6850 /// This contains all DAGCombine rules which reduce two values combined by
6851 /// an Or operation to a single value \see visitANDLike().
/// N0 and N1 are the operands of the OR node N. Returns the simplified value
/// or an empty SDValue when no rule applies.
6852 SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
6853 EVT VT = N1.getValueType();
6856 // fold (or x, undef) -> -1
6857 if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
6858 return DAG.getAllOnesConstant(DL, VT);
// Try combining (setcc) | (setcc) into a single comparison.
6860 if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
6863 // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
6864 if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
6865 // Don't increase # computations.
6866 (N0->hasOneUse() || N1->hasOneUse())) {
6867 // We can only do this xform if we know that bits from X that are set in C2
6868 // but not in C1 are already zero. Likewise for Y.
// Opaque constants are skipped: they must not be folded into new constants.
6869 if (const ConstantSDNode *N0O1C =
6870 getAsNonOpaqueConstant(N0.getOperand(1))) {
6871 if (const ConstantSDNode *N1O1C =
6872 getAsNonOpaqueConstant(N1.getOperand(1))) {
6873 // We can only do this xform if we know that bits from X that are set in
6874 // C2 but not in C1 are already zero. Likewise for Y.
6875 const APInt &LHSMask = N0O1C->getAPIntValue();
6876 const APInt &RHSMask = N1O1C->getAPIntValue();
// MaskedValueIsZero proves the dropped mask bits can't change the result.
6878 if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
6879 DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
6880 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
6881 N0.getOperand(0), N1.getOperand(0));
6882 return DAG.getNode(ISD::AND, DL, VT, X,
6883 DAG.getConstant(LHSMask | RHSMask, DL, VT));
6889 // (or (and X, M), (and X, N)) -> (and X, (or M, N))
6890 if (N0.getOpcode() == ISD::AND &&
6891 N1.getOpcode() == ISD::AND &&
6892 N0.getOperand(0) == N1.getOperand(0) &&
6893 // Don't increase # computations.
6894 (N0->hasOneUse() || N1->hasOneUse())) {
6895 SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
6896 N0.getOperand(1), N1.getOperand(1));
6897 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
6903 /// OR combines for which the commuted variant will be tried as well.
/// Caller (visitOR) invokes this twice, once with (N0, N1) and once with
/// (N1, N0), so each pattern here only needs to be written one way.
6904 static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
6906 EVT VT = N0.getValueType();
6907 if (N0.getOpcode() == ISD::AND) {
6908 SDValue N00 = N0.getOperand(0);
6909 SDValue N01 = N0.getOperand(1);
6911 // fold (or (and X, (xor Y, -1)), Y) -> (or X, Y)
6912 // TODO: Set AllowUndefs = true.
6913 if (getBitwiseNotOperand(N01, N00,
6914 /* AllowUndefs */ false) == N1)
6915 return DAG.getNode(ISD::OR, SDLoc(N), VT, N00, N1);
6917 // fold (or (and (xor Y, -1), X), Y) -> (or X, Y)
6918 if (getBitwiseNotOperand(N00, N01,
6919 /* AllowUndefs */ false) == N1)
6920 return DAG.getNode(ISD::OR, SDLoc(N), VT, N01, N1);
6923 if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
// Shift amounts of funnel shifts are often zero-extended; look through the
// extension when comparing them against a plain shift's amount.
6926 auto peekThroughZext = [](SDValue V) {
6927 if (V->getOpcode() == ISD::ZERO_EXTEND)
6928 return V->getOperand(0);
// The shl/srl half is redundant with the funnel shift; keep just the fsh.
6932 // (fshl X, ?, Y) | (shl X, Y) --> fshl X, ?, Y
6933 if (N0.getOpcode() == ISD::FSHL && N1.getOpcode() == ISD::SHL &&
6934 N0.getOperand(0) == N1.getOperand(0) &&
6935 peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
6938 // (fshr ?, X, Y) | (srl X, Y) --> fshr ?, X, Y
6939 if (N0.getOpcode() == ISD::FSHR && N1.getOpcode() == ISD::SRL &&
6940 N0.getOperand(1) == N1.getOperand(0) &&
6941 peekThroughZext(N0.getOperand(2)) == peekThroughZext(N1.getOperand(1)))
/// Main combine entry point for ISD::OR nodes. Tries constant folding,
/// canonicalization, vector-specific folds, bswap/rotate/funnel-shift
/// recognition, load combining, and demanded-bits simplification, in order.
6947 SDValue DAGCombiner::visitOR(SDNode *N) {
6948 SDValue N0 = N->getOperand(0);
6949 SDValue N1 = N->getOperand(1);
6950 EVT VT = N1.getValueType();
6956 // fold (or c1, c2) -> c1|c2
6957 if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
6960 // canonicalize constant to RHS
6961 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
6962 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
6963 return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
// Vector-only folds come first so later scalar-style folds see a simpler DAG.
6966 if (VT.isVector()) {
6967 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
6970 // fold (or x, 0) -> x, vector edition
6971 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
6974 // fold (or x, -1) -> -1, vector edition
6975 if (ISD::isConstantSplatVectorAllOnes(N1.getNode()))
6976 // do not return N1, because undef node may exist in N1
6977 return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
6979 // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
6980 // Do this only if the resulting type / shuffle is legal.
6981 auto *SV0 = dyn_cast<ShuffleVectorSDNode>(N0);
6982 auto *SV1 = dyn_cast<ShuffleVectorSDNode>(N1);
6983 if (SV0 && SV1 && TLI.isTypeLegal(VT)) {
6984 bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
6985 bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
6986 bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
6987 bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
6988 // Ensure both shuffles have a zero input.
// XOR-style test: exactly one operand of each shuffle must be the zero vector.
6989 if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
6990 assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
6991 assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
6992 bool CanFold = true;
6993 int NumElts = VT.getVectorNumElements();
6994 SmallVector<int, 4> Mask(NumElts, -1);
// Build a combined mask: for each lane, exactly one side may contribute a
// non-zero element; the other must be zero (or undef).
6996 for (int i = 0; i != NumElts; ++i) {
6997 int M0 = SV0->getMaskElt(i);
6998 int M1 = SV1->getMaskElt(i);
7000 // Determine if either index is pointing to a zero vector.
7001 bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
7002 bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
7004 // If one element is zero and the otherside is undef, keep undef.
7005 // This also handles the case that both are undef.
7006 if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0))
7009 // Make sure only one of the elements is zero.
7010 if (M0Zero == M1Zero) {
7015 assert((M0 >= 0 || M1 >= 0) && "Undef index!");
7017 // We have a zero and non-zero element. If the non-zero came from
7018 // SV0 make the index a LHS index. If it came from SV1, make it
7019 // a RHS index. We need to mod by NumElts because we don't care
7020 // which operand it came from in the original shuffles.
7021 Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
// Pick the non-zero operand of each shuffle as the new shuffle inputs.
7025 SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
7026 SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
7028 SDValue LegalShuffle =
7029 TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
7032 return LegalShuffle;
7038 // fold (or x, 0) -> x
7039 if (isNullConstant(N1))
7042 // fold (or x, -1) -> -1
7043 if (isAllOnesConstant(N1))
7046 if (SDValue NewSel = foldBinOpIntoSelect(N))
7049 // fold (or x, c) -> c iff (x & ~c) == 0
7050 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
7051 if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
7054 if (SDValue Combined = visitORLike(N0, N1, N))
7057 if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
7060 // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
7061 if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
7063 if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
// Reassociate (or (or a, b), c) style trees to expose further folds.
7067 if (SDValue ROR = reassociateOps(ISD::OR, SDLoc(N), N0, N1, N->getFlags()))
7070 // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
7071 // iff (c1 & c2) != 0 or c1/c2 are undef.
7072 auto MatchIntersect = [](ConstantSDNode *C1, ConstantSDNode *C2) {
7073 return !C1 || !C2 || C1->getAPIntValue().intersects(C2->getAPIntValue());
7075 if (N0.getOpcode() == ISD::AND && N0->hasOneUse() &&
7076 ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
7077 if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
7078 {N1, N0.getOperand(1)})) {
7079 SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
7080 AddToWorklist(IOR.getNode());
7081 return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
// Commutative patterns: try both operand orders.
7085 if (SDValue Combined = visitORCommutative(DAG, N0, N1, N))
7087 if (SDValue Combined = visitORCommutative(DAG, N1, N0, N))
7090 // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
7091 if (N0.getOpcode() == N1.getOpcode())
7092 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
7095 // See if this is some rotate idiom.
7096 if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
// Try merging narrow loads OR'ed together into one wide load.
7099 if (SDValue Load = MatchLoadCombine(N))
7102 // Simplify the operands using demanded-bits information.
7103 if (SimplifyDemandedBits(SDValue(N, 0)))
7104 return SDValue(N, 0);
7106 // If OR can be rewritten into ADD, try combines based on ADD.
// Valid when the operands share no set bits, so OR and ADD are equivalent.
7107 if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
7108 DAG.haveNoCommonBitsSet(N0, N1))
7109 if (SDValue Combined = visitADDLike(N))
/// If Op is (and X, C) with C a constant (or constant build-vector), record C
/// in Mask and return X; otherwise Op is returned unchanged. Used by the
/// rotate matcher to peel an optional AND mask off a shift.
7115 static SDValue stripConstantMask(SelectionDAG &DAG, SDValue Op, SDValue &Mask) {
7116 if (Op.getOpcode() == ISD::AND &&
7117 DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
7118 Mask = Op.getOperand(1);
7119 return Op.getOperand(0);
7124 /// Match "(X shl/srl V1) & V2" where V2 may not be present.
/// On a match, Shift receives the shift node and Mask the optional AND mask.
7125 static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
// Peel an optional constant mask first, then look for the shift underneath.
7127 Op = stripConstantMask(DAG, Op, Mask);
7128 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
7135 /// Helper function for visitOR to extract the needed side of a rotate idiom
7136 /// from a shl/srl/mul/udiv. This is meant to handle cases where
7137 /// InstCombine merged some outside op with one of the shifts from
7138 /// the rotate pattern.
7139 /// \returns An empty \c SDValue if the needed shift couldn't be extracted.
7140 /// Otherwise, returns an expansion of \p ExtractFrom based on the following
7143 /// (or (add v v) (shrl v bitwidth-1)):
7144 /// expands (add v v) -> (shl v 1)
7146 /// (or (mul v c0) (shrl (mul v c1) c2)):
7147 /// expands (mul v c0) -> (shl (mul v c1) c3)
7149 /// (or (udiv v c0) (shl (udiv v c1) c2)):
7150 /// expands (udiv v c0) -> (shrl (udiv v c1) c3)
7152 /// (or (shl v c0) (shrl (shl v c1) c2)):
7153 /// expands (shl v c0) -> (shl (shl v c1) c3)
7155 /// (or (shrl v c0) (shl (shrl v c1) c2)):
7156 /// expands (shrl v c0) -> (shrl (shrl v c1) c3)
7158 /// Such that in all cases, c3+c2==bitwidth(op v c1).
7159 static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
7160 SDValue ExtractFrom, SDValue &Mask,
7162 assert(OppShift && ExtractFrom && "Empty SDValue");
7164 (OppShift.getOpcode() == ISD::SHL || OppShift.getOpcode() == ISD::SRL) &&
7165 "Existing shift must be valid as a rotate half");
// Peel an optional constant AND mask off the side we're extracting from.
7167 ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
7169 // Value and Type of the shift.
7170 SDValue OppShiftLHS = OppShift.getOperand(0);
7171 EVT ShiftedVT = OppShiftLHS.getValueType();
7173 // Amount of the existing shift.
7174 ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
7176 // (add v v) -> (shl v 1)
7177 // TODO: Should this be a general DAG canonicalization?
// Special case: x+x is the shl-by-1 half of a rotate-by-1 against
// (srl x, bitwidth-1).
7178 if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
7179 ExtractFrom.getOpcode() == ISD::ADD &&
7180 ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
7181 ExtractFrom.getOperand(0) == OppShiftLHS &&
7182 OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
7183 return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
7184 DAG.getShiftAmountConstant(1, ShiftedVT, DL));
7187 // (or (op0 v c0) (shiftl/r (op0 v c1) c2))
7189 // Find opcode of the needed shift to be extracted from (op0 v c0).
7190 unsigned Opcode = ISD::DELETED_NODE;
7191 bool IsMulOrDiv = false;
7192 // Set Opcode and IsMulOrDiv if the extract opcode matches the needed shift
7193 // opcode or its arithmetic (mul or udiv) variant.
7194 auto SelectOpcode = [&](unsigned NeededShift, unsigned MulOrDivVariant) {
7195 IsMulOrDiv = ExtractFrom.getOpcode() == MulOrDivVariant;
7196 if (!IsMulOrDiv && ExtractFrom.getOpcode() != NeededShift)
7198 Opcode = NeededShift;
7201 // op0 must be either the needed shift opcode or the mul/udiv equivalent
7202 // that the needed shift can be extracted from.
// The needed shift is the opposite direction of the existing one:
// srl on one side requires shl (or mul) on the other, and vice versa.
7203 if ((OppShift.getOpcode() != ISD::SRL || !SelectOpcode(ISD::SHL, ISD::MUL)) &&
7204 (OppShift.getOpcode() != ISD::SHL || !SelectOpcode(ISD::SRL, ISD::UDIV)))
7207 // op0 must be the same opcode on both sides, have the same LHS argument,
7208 // and produce the same value type.
7209 if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
7210 OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
7211 ShiftedVT != ExtractFrom.getValueType())
7214 // Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
7215 ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
7216 // Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
7217 ConstantSDNode *ExtractFromCst =
7218 isConstOrConstSplat(ExtractFrom.getOperand(1));
7219 // TODO: We should be able to handle non-uniform constant vectors for these values
7220 // Check that we have constant values.
7221 if (!OppShiftCst || !OppShiftCst->getAPIntValue() ||
7222 !OppLHSCst || !OppLHSCst->getAPIntValue() ||
7223 !ExtractFromCst || !ExtractFromCst->getAPIntValue())
7226 // Compute the shift amount we need to extract to complete the rotate.
7227 const unsigned VTWidth = ShiftedVT.getScalarSizeInBits();
7228 if (OppShiftCst->getAPIntValue().ugt(VTWidth))
// c3 = bitwidth - c2; the two shift amounts of a rotate must sum to bitwidth.
7230 APInt NeededShiftAmt = VTWidth - OppShiftCst->getAPIntValue();
7231 // Normalize the bitwidth of the two mul/udiv/shift constant operands.
7232 APInt ExtractFromAmt = ExtractFromCst->getAPIntValue();
7233 APInt OppLHSAmt = OppLHSCst->getAPIntValue();
7234 zeroExtendToMatch(ExtractFromAmt, OppLHSAmt);
7236 // Now try extract the needed shift from the ExtractFrom op and see if the
7237 // result matches up with the existing shift's LHS op.
7239 // Op to extract from is a mul or udiv by a constant.
7241 // c2 / (1 << (bitwidth(op0 v c0) - c1)) == c0
7242 // c2 % (1 << (bitwidth(op0 v c0) - c1)) == 0
7243 const APInt ExtractDiv = APInt::getOneBitSet(ExtractFromAmt.getBitWidth(),
7244 NeededShiftAmt.getZExtValue());
// The division must be exact, i.e. the extracted shift explains the whole
// difference between the two constants.
7247 APInt::udivrem(ExtractFromAmt, ExtractDiv, ResultAmt, Rem);
7248 if (Rem != 0 || ResultAmt != OppLHSAmt)
7251 // Op to extract from is a shift by a constant.
7253 // c2 - (bitwidth(op0 v c0) - c1) == c0
7254 if (OppLHSAmt != ExtractFromAmt - NeededShiftAmt.zextOrTrunc(
7255 ExtractFromAmt.getBitWidth()))
7259 // Return the expanded shift op that should allow a rotate to be formed.
7260 EVT ShiftVT = OppShift.getOperand(1).getValueType();
7261 EVT ResVT = ExtractFrom.getValueType();
7262 SDValue NewShiftNode = DAG.getConstant(NeededShiftAmt, DL, ShiftVT);
7263 return DAG.getNode(Opcode, DL, ResVT, OppShiftLHS, NewShiftNode);
7266 // Return true if we can prove that, whenever Neg and Pos are both in the
7267 // range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
7268 // for two opposing shifts shift1 and shift2 and a value X with OpBits bits:
7270 // (or (shift1 X, Neg), (shift2 X, Pos))
7272 // reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
7273 // in direction shift1 by Neg. The range [0, EltSize) means that we only need
7274 // to consider shift amounts with defined behavior.
7276 // The IsRotate flag should be set when the LHS of both shifts is the same.
7277 // Otherwise if matching a general funnel shift, it should be clear.
7278 static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
7279 SelectionDAG &DAG, bool IsRotate) {
7280 const auto &TLI = DAG.getTargetLoweringInfo();
7281 // If EltSize is a power of 2 then:
7283 // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
7284 // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
7286 // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
7287 // for the stronger condition:
7289 // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
7291 // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
7292 // we can just replace Neg with Neg' for the rest of the function.
7294 // In other cases we check for the even stronger condition:
7296 // Neg == EltSize - Pos [B]
7298 // for all Neg and Pos. Note that the (or ...) then invokes undefined
7299 // behavior if Pos == 0 (and consequently Neg == EltSize).
7301 // We could actually use [A] whenever EltSize is a power of 2, but the
7302 // only extra cases that it would match are those uninteresting ones
7303 // where Neg and Pos are never in range at the same time. E.g. for
7304 // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
7305 // as well as (sub 32, Pos), but:
7307 // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
7309 // always invokes undefined behavior for 32-bit X.
7311 // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
7312 // This allows us to peek through any operations that only affect Mask's
7313 // un-demanded bits.
7315 // NOTE: We can only do this when matching operations which won't modify the
7316 // least Log2(EltSize) significant bits and not a general funnel shift.
7317 unsigned MaskLoBits = 0;
7318 if (IsRotate && isPowerOf2_64(EltSize)) {
7319 unsigned Bits = Log2_64(EltSize);
7320 unsigned NegBits = Neg.getScalarValueSizeInBits();
7321 if (NegBits >= Bits) {
7322 APInt DemandedBits = APInt::getLowBitsSet(NegBits, Bits);
// Peek through ops on Neg that only affect bits above the mask.
7324 TLI.SimplifyMultipleUseDemandedBits(Neg, DemandedBits, DAG)) {
7331 // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
7332 if (Neg.getOpcode() != ISD::SUB)
7334 ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
7337 SDValue NegOp1 = Neg.getOperand(1);
7339 // On the RHS of [A], if Pos is the result of operation on Pos' that won't
7340 // affect Mask's demanded bits, just replace Pos with Pos'. These operations
7341 // are redundant for the purpose of the equality.
7343 unsigned PosBits = Pos.getScalarValueSizeInBits();
7344 if (PosBits >= MaskLoBits) {
7345 APInt DemandedBits = APInt::getLowBitsSet(PosBits, MaskLoBits);
7347 TLI.SimplifyMultipleUseDemandedBits(Pos, DemandedBits, DAG)) {
7353 // The condition we need is now:
7355 // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
7357 // If NegOp1 == Pos then we need:
7359 // EltSize & Mask == NegC & Mask
7361 // (because "x & Mask" is a truncation and distributes through subtraction).
7363 // We also need to account for a potential truncation of NegOp1 if the amount
7364 // has already been legalized to a shift amount type.
// Width accumulates the value that must equal EltSize (mod Mask) for a match.
7366 if ((Pos == NegOp1) ||
7367 (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
7368 Width = NegC->getAPIntValue();
7370 // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
7371 // Then the condition we want to prove becomes:
7373 // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
7375 // which, again because "x & Mask" is a truncation, becomes:
7377 // NegC & Mask == (EltSize - PosC) & Mask
7378 // EltSize & Mask == (NegC + PosC) & Mask
7379 else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
7380 if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
7381 Width = PosC->getAPIntValue() + NegC->getAPIntValue();
7387 // Now we just need to check that EltSize & Mask == Width & Mask.
7389 // EltSize & Mask is 0 since Mask is EltSize - 1.
7390 return Width.getLoBits(MaskLoBits) == 0;
7391 return Width == EltSize;
7394 // A subroutine of MatchRotate used once we have found an OR of two opposite
7395 // shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
7396 // to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
7397 // former being preferred if supported. InnerPos and InnerNeg are Pos and
7398 // Neg with outer conversions stripped away.
// HasPos indicates whether the target supports PosOpcode; the emitted node
// uses PosOpcode/Pos when it does and NegOpcode/Neg otherwise.
7399 SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
7400 SDValue Neg, SDValue InnerPos,
7401 SDValue InnerNeg, bool HasPos,
7402 unsigned PosOpcode, unsigned NegOpcode,
7404 // fold (or (shl x, (*ext y)),
7405 // (srl x, (*ext (sub 32, y)))) ->
7406 // (rotl x, y) or (rotr x, (sub 32, y))
7408 // fold (or (shl x, (*ext (sub 32, y))),
7409 // (srl x, (*ext y))) ->
7410 // (rotr x, y) or (rotl x, (sub 32, y))
7411 EVT VT = Shifted.getValueType();
// matchRotateSub proves Neg == EltSize - Pos over the defined shift range;
// IsRotate is true here because both shifts operate on the same value.
7412 if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG,
7413 /*IsRotate*/ true)) {
7414 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
7415 HasPos ? Pos : Neg);
7421 // A subroutine of MatchRotate used once we have found an OR of two opposite
7422 // shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
7423 // to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
7424 // former being preferred if supported. InnerPos and InnerNeg are Pos and
7425 // Neg with outer conversions stripped away.
7426 // TODO: Merge with MatchRotatePosNeg.
7427 SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
7428 SDValue Neg, SDValue InnerPos,
7429 SDValue InnerNeg, bool HasPos,
7430 unsigned PosOpcode, unsigned NegOpcode,
7432 EVT VT = N0.getValueType();
7433 unsigned EltBits = VT.getScalarSizeInBits();
7435 // fold (or (shl x0, (*ext y)),
7436 // (srl x1, (*ext (sub 32, y)))) ->
7437 // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
7439 // fold (or (shl x0, (*ext (sub 32, y))),
7440 // (srl x1, (*ext y))) ->
7441 // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
// IsRotate only when both shift inputs are the same value; a true funnel
// shift (N0 != N1) needs the stronger sub-pattern proof in matchRotateSub.
7442 if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG, /*IsRotate*/ N0 == N1)) {
7443 return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
7444 HasPos ? Pos : Neg);
7447 // Matching the shift+xor cases, we can't easily use the xor'd shift amount
7448 // so for now just use the PosOpcode case if its legal.
7449 // TODO: When can we use the NegOpcode case?
7450 if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
// Helper: is Op (BinOpc X, Imm) with a constant (or splat) immediate?
7451 auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
7452 if (Op.getOpcode() != BinOpc)
7454 ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
7455 return Cst && (Cst->getAPIntValue() == Imm);
// The xor-with-bitwidth-1 idiom: (y ^ 31) plus a pre-shift by 1 equals the
// (bitwidth - y) complement shift without the y == 0 edge case.
7458 // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
7459 // -> (fshl x0, x1, y)
7460 if (IsBinOpImm(N1, ISD::SRL, 1) &&
7461 IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
7462 InnerPos == InnerNeg.getOperand(0) &&
7463 TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
7464 return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
7467 // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
7468 // -> (fshr x0, x1, y)
7469 if (IsBinOpImm(N0, ISD::SHL, 1) &&
7470 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
7471 InnerNeg == InnerPos.getOperand(0) &&
7472 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
7473 return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
7476 // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
7477 // -> (fshr x0, x1, y)
7478 // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
7479 if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
7480 IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
7481 InnerNeg == InnerPos.getOperand(0) &&
7482 TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
7483 return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
7490 // MatchRotate - Handle an 'or' of two operands. If this is one of the many
7491 // idioms for rotate, and if the target supports rotation instructions, generate
7492 // a rot[lr]. This also matches funnel shift patterns, similar to rotation but
7493 // with different shifted sources.
7494 SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
7495 EVT VT = LHS.getValueType();
7497 // The target must have at least one rotate/funnel flavor.
7498 // We still try to match rotate by constant pre-legalization.
7499 // TODO: Support pre-legalization funnel-shift by constant.
7500 bool HasROTL = hasOperation(ISD::ROTL, VT);
7501 bool HasROTR = hasOperation(ISD::ROTR, VT);
7502 bool HasFSHL = hasOperation(ISD::FSHL, VT);
7503 bool HasFSHR = hasOperation(ISD::FSHR, VT);
7505 // If the type is going to be promoted and the target has enabled custom
7506 // lowering for rotate, allow matching rotate by non-constants. Only allow
7507 // this for scalar types.
7508 if (VT.isScalarInteger() && TLI.getTypeAction(*DAG.getContext(), VT) ==
7509 TargetLowering::TypePromoteInteger) {
7510 HasROTL |= TLI.getOperationAction(ISD::ROTL, VT) == TargetLowering::Custom;
7511 HasROTR |= TLI.getOperationAction(ISD::ROTR, VT) == TargetLowering::Custom;
7514 if (LegalOperations && !HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
7517 // Check for truncated rotate.
// Recurse on the wider pre-truncate operands, then truncate the result.
7518 if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
7519 LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
7520 assert(LHS.getValueType() == RHS.getValueType());
7521 if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
7522 return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
7526 // Match "(X shl/srl V1) & V2" where V2 may not be present.
7527 SDValue LHSShift; // The shift.
7528 SDValue LHSMask; // AND value if any.
7529 matchRotateHalf(DAG, LHS, LHSShift, LHSMask);
7531 SDValue RHSShift; // The shift.
7532 SDValue RHSMask; // AND value if any.
7533 matchRotateHalf(DAG, RHS, RHSShift, RHSMask);
7535 // If neither side matched a rotate half, bail
7536 if (!LHSShift && !RHSShift)
7539 // InstCombine may have combined a constant shl, srl, mul, or udiv with one
7540 // side of the rotate, so try to handle that here. In all cases we need to
7541 // pass the matched shift from the opposite side to compute the opcode and
7542 // needed shift amount to extract. We still want to do this if both sides
7543 // matched a rotate half because one half may be a potential overshift that
7544 // can be broken down (ie if InstCombine merged two shl or srl ops into a
7547 // Have LHS side of the rotate, try to extract the needed shift from the RHS.
7549 if (SDValue NewRHSShift =
7550 extractShiftForRotate(DAG, LHSShift, RHS, RHSMask, DL))
7551 RHSShift = NewRHSShift;
7552 // Have RHS side of the rotate, try to extract the needed shift from the LHS.
7554 if (SDValue NewLHSShift =
7555 extractShiftForRotate(DAG, RHSShift, LHS, LHSMask, DL))
7556 LHSShift = NewLHSShift;
7558 // If a side is still missing, nothing else we can do.
7559 if (!RHSShift || !LHSShift)
7562 // At this point we've matched or extracted a shift op on each side.
7564 if (LHSShift.getOpcode() == RHSShift.getOpcode())
7565 return SDValue(); // Shifts must disagree.
7567 // Canonicalize shl to left side in a shl/srl pair.
7568 if (RHSShift.getOpcode() == ISD::SHL) {
7569 std::swap(LHS, RHS);
7570 std::swap(LHSShift, RHSShift);
7571 std::swap(LHSMask, RHSMask);
7574 unsigned EltSizeInBits = VT.getScalarSizeInBits();
7575 SDValue LHSShiftArg = LHSShift.getOperand(0);
7576 SDValue LHSShiftAmt = LHSShift.getOperand(1);
7577 SDValue RHSShiftArg = RHSShift.getOperand(0);
7578 SDValue RHSShiftAmt = RHSShift.getOperand(1);
// A rotate/funnel by constant requires the two shift amounts to cover the
// element exactly: C1 + C2 == bitwidth.
7580 auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
7581 ConstantSDNode *RHS) {
7582 return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
// Reapply any AND masks matched on the shifted operands so the combined
// result keeps only the originally-demanded bits.
7585 auto ApplyMasks = [&](SDValue Res) {
7586 // If there is an AND of either shifted operand, apply it to the result.
7587 if (LHSMask.getNode() || RHSMask.getNode()) {
7588 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
7589 SDValue Mask = AllOnes;
7591 if (LHSMask.getNode()) {
7592 SDValue RHSBits = DAG.getNode(ISD::SRL, DL, VT, AllOnes, RHSShiftAmt);
7593 Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
7594 DAG.getNode(ISD::OR, DL, VT, LHSMask, RHSBits));
7596 if (RHSMask.getNode()) {
7597 SDValue LHSBits = DAG.getNode(ISD::SHL, DL, VT, AllOnes, LHSShiftAmt);
7598 Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
7599 DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
7602 Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
7608 // TODO: Support pre-legalization funnel-shift by constant.
// Same source on both shifts => rotate; different sources need funnel shift.
7609 bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
7610 if (!IsRotate && !(HasFSHL || HasFSHR)) {
7611 if (TLI.isTypeLegal(VT) && LHS.hasOneUse() && RHS.hasOneUse() &&
7612 ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
7613 // Look for a disguised rotate by constant.
7614 // The common shifted operand X may be hidden inside another 'or'.
7616 auto matchOr = [&X, &Y](SDValue Or, SDValue CommonOp) {
7617 if (!Or.hasOneUse() || Or.getOpcode() != ISD::OR)
7619 if (CommonOp == Or.getOperand(0)) {
7621 Y = Or.getOperand(1);
7624 if (CommonOp == Or.getOperand(1)) {
7626 Y = Or.getOperand(0);
7633 if (matchOr(LHSShiftArg, RHSShiftArg)) {
7634 // (shl (X | Y), C1) | (srl X, C2) --> (rotl X, C1) | (shl Y, C1)
7635 SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
7636 SDValue ShlY = DAG.getNode(ISD::SHL, DL, VT, Y, LHSShiftAmt);
7637 Res = DAG.getNode(ISD::OR, DL, VT, RotX, ShlY);
7638 } else if (matchOr(RHSShiftArg, LHSShiftArg)) {
7639 // (shl X, C1) | (srl (X | Y), C2) --> (rotl X, C1) | (srl Y, C2)
7640 SDValue RotX = DAG.getNode(ISD::ROTL, DL, VT, X, LHSShiftAmt);
7641 SDValue SrlY = DAG.getNode(ISD::SRL, DL, VT, Y, RHSShiftAmt);
7642 Res = DAG.getNode(ISD::OR, DL, VT, RotX, SrlY);
7647 return ApplyMasks(Res);
7650 return SDValue(); // Requires funnel shift support.
7653 // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
7654 // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
7655 // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
7656 // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
7657 // iff C1+C2 == EltSizeInBits
7658 if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
// Prefer a rotate when both shifts share the source; otherwise emit the
// funnel-shift flavor the target supports (or either, pre-legalization).
7660 if (IsRotate && (HasROTL || HasROTR || !(HasFSHL || HasFSHR))) {
7661 bool UseROTL = !LegalOperations || HasROTL;
7662 Res = DAG.getNode(UseROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
7663 UseROTL ? LHSShiftAmt : RHSShiftAmt);
7665 bool UseFSHL = !LegalOperations || HasFSHL;
7666 Res = DAG.getNode(UseFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
7667 RHSShiftArg, UseFSHL ? LHSShiftAmt : RHSShiftAmt);
7670 return ApplyMasks(Res);
7673 // Even pre-legalization, we can't easily rotate/funnel-shift by a variable
7675 if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
7678 // If there is a mask here, and we have a variable shift, we can't be sure
7679 // that we're masking out the right stuff.
7680 if (LHSMask.getNode() || RHSMask.getNode())
7683 // If the shift amount is sign/zext/any-extended just peel it off.
7684 SDValue LExtOp0 = LHSShiftAmt;
7685 SDValue RExtOp0 = RHSShiftAmt;
7686 if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
7687 LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
7688 LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
7689 LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
7690 (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
7691 RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
7692 RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
7693 RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
7694 LExtOp0 = LHSShiftAmt.getOperand(0);
7695 RExtOp0 = RHSShiftAmt.getOperand(0);
// Variable shift amounts: try pos/neg rotate patterns in both operand
// orders, then the equivalent funnel-shift pos/neg patterns.
7698 if (IsRotate && (HasROTL || HasROTR)) {
7700 MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
7701 RExtOp0, HasROTL, ISD::ROTL, ISD::ROTR, DL);
7706 MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
7707 LExtOp0, HasROTR, ISD::ROTR, ISD::ROTL, DL);
7713 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
7714 LExtOp0, RExtOp0, HasFSHL, ISD::FSHL, ISD::FSHR, DL);
7719 MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
7720 RExtOp0, LExtOp0, HasFSHR, ISD::FSHR, ISD::FSHL, DL);
7729 /// Represents known origin of an individual byte in load combine pattern. The
7730 /// value of the byte is either constant zero or comes from memory.
7731 struct ByteProvider {
7732 // For constant zero providers Load is set to nullptr. For memory providers
7733 // Load represents the node which loads the byte from memory.
7734 // ByteOffset is the offset of the byte in the value produced by the load.
7735 LoadSDNode *Load = nullptr;
7736 unsigned ByteOffset = 0;
7738 ByteProvider() = default;
// Factory for a byte that comes from memory: byte 'ByteOffset' of 'Load'.
7740 static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
7741 return ByteProvider(Load, ByteOffset);
// Factory for a byte known to be constant zero.
7744 static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
7746 bool isConstantZero() const { return !Load; }
7747 bool isMemory() const { return Load; }
7749 bool operator==(const ByteProvider &Other) const {
7750 return Other.Load == Load && Other.ByteOffset == ByteOffset;
// Construct via the getMemory/getConstantZero factories above.
7754 ByteProvider(LoadSDNode *Load, unsigned ByteOffset)
7755 : Load(Load), ByteOffset(ByteOffset) {}
7758 } // end anonymous namespace
7760 /// Recursively traverses the expression calculating the origin of the requested
7761 /// byte of the given value. Returns None if the provider can't be calculated.
7763 /// For all the values except the root of the expression verifies that the value
7764 /// has exactly one use and if it's not true return None. This way if the origin
7765 /// of the byte is returned it's guaranteed that the values which contribute to
7766 /// the byte are not used outside of this expression.
7768 /// Because the parts of the expression are not allowed to have more than one
7769 /// use this function iterates over trees, not DAGs. So it never visits the same
7770 /// node more than once.
7771 static const Optional<ByteProvider>
7772 calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
7773 bool Root = false) {
7774 // Typical i64 by i8 pattern requires recursion up to 8 calls depth
// Non-root nodes must be single-use so the whole expression can be replaced
// safely (see the function-level comment above).
7778 if (!Root && !Op.hasOneUse())
7781 assert(Op.getValueType().isScalarInteger() && "can't handle other types");
7782 unsigned BitWidth = Op.getValueSizeInBits();
// Only byte-granular values can be decomposed into byte providers.
7783 if (BitWidth % 8 != 0)
7785 unsigned ByteWidth = BitWidth / 8;
7786 assert(Index < ByteWidth && "invalid index requested");
7789 switch (Op.getOpcode()) {
// OR of two providers: the byte is usable only if one side's byte is known
// constant zero; then the other side provides it.
7791 auto LHS = calculateByteProvider(Op->getOperand(0), Index, Depth + 1);
7794 auto RHS = calculateByteProvider(Op->getOperand(1), Index, Depth + 1);
7798 if (LHS->isConstantZero())
7800 if (RHS->isConstantZero())
// Shift by constant: only byte-aligned amounts translate into a byte-index
// adjustment.
7805 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
7809 uint64_t BitShift = ShiftOp->getZExtValue();
7810 if (BitShift % 8 != 0)
7812 uint64_t ByteShift = BitShift / 8;
// Bytes shifted in are zero; otherwise consult the pre-shift byte.
7814 return Index < ByteShift
7815 ? ByteProvider::getConstantZero()
7816 : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
7819 case ISD::ANY_EXTEND:
7820 case ISD::SIGN_EXTEND:
7821 case ISD::ZERO_EXTEND: {
7822 SDValue NarrowOp = Op->getOperand(0);
7823 unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
7824 if (NarrowBitWidth % 8 != 0)
7826 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
// Bytes above the narrow value are known zero only for zero-extension.
7828 if (Index >= NarrowByteWidth)
7829 return Op.getOpcode() == ISD::ZERO_EXTEND
7830 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
7832 return calculateByteProvider(NarrowOp, Index, Depth + 1);
// BSWAP mirrors the byte index within the value.
7835 return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
// Load: the leaf case - the requested byte comes directly from memory.
7838 auto L = cast<LoadSDNode>(Op.getNode());
7839 if (!L->isSimple() || L->isIndexed())
7842 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
7843 if (NarrowBitWidth % 8 != 0)
7845 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
// Bytes beyond the loaded memory width are zero only for ZEXTLOAD.
7847 if (Index >= NarrowByteWidth)
7848 return L->getExtensionType() == ISD::ZEXTLOAD
7849 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
7851 return ByteProvider::getMemory(L, Index);
// Memory offset of logical byte i within a BW-byte little-endian value
// (used by isBigEndian below to test offset patterns).
7858 static unsigned littleEndianByteAt(unsigned BW, unsigned i) {
// Memory offset of logical byte i within a BW-byte big-endian value
// (used by isBigEndian below to test offset patterns).
7862 static unsigned bigEndianByteAt(unsigned BW, unsigned i) {
7866 // Check if the bytes offsets we are looking at match with either big or
7867 // little endian value loaded. Return true for big endian, false for little
7868 // endian, and None if match failed.
7869 static Optional<bool> isBigEndian(const ArrayRef<int64_t> ByteOffsets,
7870 int64_t FirstOffset) {
7871 // The endian can be decided only when it is 2 bytes at least.
7872 unsigned Width = ByteOffsets.size();
7876 bool BigEndian = true, LittleEndian = true;
7877 for (unsigned i = 0; i < Width; i++) {
// Compare each offset relative to the lowest (first) byte offset.
7878 int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
7879 LittleEndian &= CurrentByteOffset == littleEndianByteAt(Width, i);
7880 BigEndian &= CurrentByteOffset == bigEndianByteAt(Width, i);
// Neither layout matches: the offsets do not form a contiguous value.
7881 if (!BigEndian && !LittleEndian)
7885 assert((BigEndian != LittleEndian) && "It should be either big endian or"
// Peel any chain of truncate/extend nodes to reach the underlying source
// value, so differently-extended copies of one value compare equal.
7890 static SDValue stripTruncAndExt(SDValue Value) {
7891 switch (Value.getOpcode()) {
7893 case ISD::ZERO_EXTEND:
7894 case ISD::SIGN_EXTEND:
7895 case ISD::ANY_EXTEND:
7896 return stripTruncAndExt(Value.getOperand(0));
7901 /// Match a pattern where a wide type scalar value is stored by several narrow
7902 /// stores. Fold it into a single store or a BSWAP and a store if the targets
7905 /// Assuming little endian target:
7908 /// p[0] = (val >> 0) & 0xFF;
7909 /// p[1] = (val >> 8) & 0xFF;
7910 /// p[2] = (val >> 16) & 0xFF;
7911 /// p[3] = (val >> 24) & 0xFF;
7913 /// *((i32)p) = val;
7917 /// p[0] = (val >> 24) & 0xFF;
7918 /// p[1] = (val >> 16) & 0xFF;
7919 /// p[2] = (val >> 8) & 0xFF;
7920 /// p[3] = (val >> 0) & 0xFF;
7922 /// *((i32)p) = BSWAP(val);
7923 SDValue DAGCombiner::mergeTruncStores(StoreSDNode *N) {
7924 // The matching looks for "store (trunc x)" patterns that appear early but are
7925 // likely to be replaced by truncating store nodes during combining.
7926 // TODO: If there is evidence that running this later would help, this
7927 // limitation could be removed. Legality checks may need to be added
7928 // for the created store and optional bswap/rotate.
7929 if (LegalOperations || OptLevel == CodeGenOpt::None)
7932 // We only handle merging simple stores of 1-4 bytes.
7933 // TODO: Allow unordered atomics when wider type is legal (see D66309)
7934 EVT MemVT = N->getMemoryVT();
7935 if (!(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) ||
7936 !N->isSimple() || N->isIndexed())
7939 // Collect all of the stores in the chain.
7940 SDValue Chain = N->getChain();
7941 SmallVector<StoreSDNode *, 8> Stores = {N};
7942 while (auto *Store = dyn_cast<StoreSDNode>(Chain)) {
7943 // All stores must be the same size to ensure that we are writing all of the
7944 // bytes in the wide value.
7945 // TODO: We could allow multiple sizes by tracking each stored byte.
7946 if (Store->getMemoryVT() != MemVT || !Store->isSimple() ||
7949 Stores.push_back(Store);
7950 Chain = Store->getChain();
7952 // There is no reason to continue if we do not have at least a pair of stores.
7953 if (Stores.size() < 2)
7956 // Handle simple types only.
7957 LLVMContext &Context = *DAG.getContext();
7958 unsigned NumStores = Stores.size();
7959 unsigned NarrowNumBits = N->getMemoryVT().getScalarSizeInBits();
7960 unsigned WideNumBits = NumStores * NarrowNumBits;
7961 EVT WideVT = EVT::getIntegerVT(Context, WideNumBits);
7962 if (WideVT != MVT::i16 && WideVT != MVT::i32 && WideVT != MVT::i64)
7965 // Check if all bytes of the source value that we are looking at are stored
7966 // to the same base address. Collect offsets from Base address into OffsetMap.
7967 SDValue SourceValue;
// OffsetMap[i] records the memory byte offset of the i-th narrow piece of
// the wide value; INT64_MAX marks an unseen slot.
7968 SmallVector<int64_t, 8> OffsetMap(NumStores, INT64_MAX);
7969 int64_t FirstOffset = INT64_MAX;
7970 StoreSDNode *FirstStore = nullptr;
7971 Optional<BaseIndexOffset> Base;
7972 for (auto *Store : Stores) {
7973 // All the stores store different parts of the CombinedValue. A truncate is
7974 // required to get the partial value.
7975 SDValue Trunc = Store->getValue();
7976 if (Trunc.getOpcode() != ISD::TRUNCATE)
7978 // Other than the first/last part, a shift operation is required to get the
7981 SDValue WideVal = Trunc.getOperand(0);
7982 if ((WideVal.getOpcode() == ISD::SRL || WideVal.getOpcode() == ISD::SRA) &&
7983 isa<ConstantSDNode>(WideVal.getOperand(1))) {
7984 // The shift amount must be a constant multiple of the narrow type.
7985 // It is translated to the offset address in the wide source value "y".
7987 // x = srl y, ShiftAmtC
7990 uint64_t ShiftAmtC = WideVal.getConstantOperandVal(1);
7991 if (ShiftAmtC % NarrowNumBits != 0)
7994 Offset = ShiftAmtC / NarrowNumBits;
7995 WideVal = WideVal.getOperand(0);
7998 // Stores must share the same source value with different offsets.
7999 // Truncate and extends should be stripped to get the single source value.
8001 SourceValue = WideVal;
8002 else if (stripTruncAndExt(SourceValue) != stripTruncAndExt(WideVal))
8004 else if (SourceValue.getValueType() != WideVT) {
// Prefer the widest seen form of the common source value.
8005 if (WideVal.getValueType() == WideVT ||
8006 WideVal.getScalarValueSizeInBits() >
8007 SourceValue.getScalarValueSizeInBits())
8008 SourceValue = WideVal;
8009 // Give up if the source value type is smaller than the store size.
8010 if (SourceValue.getScalarValueSizeInBits() < WideVT.getScalarSizeInBits())
8014 // Stores must share the same base address.
8015 BaseIndexOffset Ptr = BaseIndexOffset::match(Store, DAG);
8016 int64_t ByteOffsetFromBase = 0;
8019 else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
8022 // Remember the first store.
8023 if (ByteOffsetFromBase < FirstOffset) {
8025 FirstOffset = ByteOffsetFromBase;
8027 // Map the offset in the store and the offset in the combined value, and
8028 // early return if it has been set before.
8029 if (Offset < 0 || Offset >= NumStores || OffsetMap[Offset] != INT64_MAX)
8031 OffsetMap[Offset] = ByteOffsetFromBase;
8034 assert(FirstOffset != INT64_MAX && "First byte offset must be set");
8035 assert(FirstStore && "First store must be set");
8037 // Check that a store of the wide type is both allowed and fast on the target
8038 const DataLayout &Layout = DAG.getDataLayout();
8040 bool Allowed = TLI.allowsMemoryAccess(Context, Layout, WideVT,
8041 *FirstStore->getMemOperand(), &Fast);
8042 if (!Allowed || !Fast)
8045 // Check if the pieces of the value are going to the expected places in memory
8046 // to merge the stores.
8047 auto checkOffsets = [&](bool MatchLittleEndian) {
8048 if (MatchLittleEndian) {
8049 for (unsigned i = 0; i != NumStores; ++i)
8050 if (OffsetMap[i] != i * (NarrowNumBits / 8) + FirstOffset)
8052 } else { // MatchBigEndian by reversing loop counter.
8053 for (unsigned i = 0, j = NumStores - 1; i != NumStores; ++i, --j)
8054 if (OffsetMap[j] != i * (NarrowNumBits / 8) + FirstOffset)
8060 // Check if the offsets line up for the native data layout of this target.
8061 bool NeedBswap = false;
8062 bool NeedRotate = false;
8063 if (!checkOffsets(Layout.isLittleEndian())) {
8064 // Special-case: check if byte offsets line up for the opposite endian.
// Byte-sized pieces in reverse order can be fixed with a BSWAP; two
// swapped halves can be fixed with a rotate by half the width.
8065 if (NarrowNumBits == 8 && checkOffsets(Layout.isBigEndian()))
8067 else if (NumStores == 2 && checkOffsets(Layout.isBigEndian()))
8074 if (WideVT != SourceValue.getValueType()) {
8075 assert(SourceValue.getValueType().getScalarSizeInBits() > WideNumBits &&
8076 "Unexpected store value to merge");
8077 SourceValue = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SourceValue);
8080 // Before legalize we can introduce illegal bswaps/rotates which will be later
8081 // converted to an explicit bswap sequence. This way we end up with a single
8082 // store and byte shuffling instead of several stores and byte shuffling.
8084 SourceValue = DAG.getNode(ISD::BSWAP, DL, WideVT, SourceValue);
8085 } else if (NeedRotate) {
8086 assert(WideNumBits % 2 == 0 && "Unexpected type for rotate");
8087 SDValue RotAmt = DAG.getConstant(WideNumBits / 2, DL, WideVT);
8088 SourceValue = DAG.getNode(ISD::ROTR, DL, WideVT, SourceValue, RotAmt);
8092 DAG.getStore(Chain, DL, SourceValue, FirstStore->getBasePtr(),
8093 FirstStore->getPointerInfo(), FirstStore->getAlign());
8095 // Rely on other DAG combine rules to remove the other individual stores.
8096 DAG.ReplaceAllUsesWith(N, NewStore.getNode());
8100 /// Match a pattern where a wide type scalar value is loaded by several narrow
8101 /// loads and combined by shifts and ors. Fold it into a single load or a load
8102 /// and a BSWAP if the targets supports it.
8104 /// Assuming little endian target:
8106 /// i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
8108 /// i32 val = *((i32)a)
8111 /// i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
8113 /// i32 val = BSWAP(*((i32)a))
8115 /// TODO: This rule matches complex patterns with OR node roots and doesn't
8116 /// interact well with the worklist mechanism. When a part of the pattern is
8117 /// updated (e.g. one of the loads) its direct users are put into the worklist,
8118 /// but the root node of the pattern which triggers the load combine is not
8119 /// necessarily a direct user of the changed node. For example, once the address
8120 /// of t28 load is reassociated load combine won't be triggered:
8121 /// t25: i32 = add t4, Constant:i32<2>
8122 /// t26: i64 = sign_extend t25
8123 /// t27: i64 = add t2, t26
8124 /// t28: i8,ch = load<LD1[%tmp9]> t0, t27, undef:i64
8125 /// t29: i32 = zero_extend t28
8126 /// t32: i32 = shl t29, Constant:i8<8>
8127 /// t33: i32 = or t23, t32
8128 /// As a possible fix visitLoad can check if the load can be a part of a load
8129 /// combine pattern and add corresponding OR roots to the worklist.
8130 SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
8131 assert(N->getOpcode() == ISD::OR &&
8132 "Can only match load combining against OR nodes");
8134 // Handles simple types only
8135 EVT VT = N->getValueType(0);
8136 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
8138 unsigned ByteWidth = VT.getSizeInBits() / 8;
8140 bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
// Translate a provider's byte index into its offset within the load's
// memory footprint, according to the target's endianness.
8141 auto MemoryByteOffset = [&] (ByteProvider P) {
8142 assert(P.isMemory() && "Must be a memory byte provider");
8143 unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
8144 assert(LoadBitWidth % 8 == 0 &&
8145 "can only analyze providers for individual bytes not bit");
8146 unsigned LoadByteWidth = LoadBitWidth / 8;
8147 return IsBigEndianTarget
8148 ? bigEndianByteAt(LoadByteWidth, P.ByteOffset)
8149 : littleEndianByteAt(LoadByteWidth, P.ByteOffset);
8152 Optional<BaseIndexOffset> Base;
8155 SmallPtrSet<LoadSDNode *, 8> Loads;
8156 Optional<ByteProvider> FirstByteProvider;
8157 int64_t FirstOffset = INT64_MAX;
8159 // Check if all the bytes of the OR we are looking at are loaded from the same
8160 // base address. Collect bytes offsets from Base address in ByteOffsets.
8161 SmallVector<int64_t, 8> ByteOffsets(ByteWidth);
8162 unsigned ZeroExtendedBytes = 0;
// Walk from the most-significant byte down so that a run of constant-zero
// high bytes can be counted for a zero-extending load.
8163 for (int i = ByteWidth - 1; i >= 0; --i) {
8164 auto P = calculateByteProvider(SDValue(N, 0), i, 0, /*Root=*/true);
8168 if (P->isConstantZero()) {
8169 // It's OK for the N most significant bytes to be 0, we can just
8170 // zero-extend the load.
8171 if (++ZeroExtendedBytes != (ByteWidth - static_cast<unsigned>(i)))
8175 assert(P->isMemory() && "provenance should either be memory or zero");
8177 LoadSDNode *L = P->Load;
8178 assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
8180 "Must be enforced by calculateByteProvider");
8181 assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
8183 // All loads must share the same chain
8184 SDValue LChain = L->getChain();
8187 else if (Chain != LChain)
8190 // Loads must share the same base address
8191 BaseIndexOffset Ptr = BaseIndexOffset::match(L, DAG);
8192 int64_t ByteOffsetFromBase = 0;
8195 else if (!Base->equalBaseIndex(Ptr, DAG, ByteOffsetFromBase))
8198 // Calculate the offset of the current byte from the base address
8199 ByteOffsetFromBase += MemoryByteOffset(*P);
8200 ByteOffsets[i] = ByteOffsetFromBase;
8202 // Remember the first byte load
8203 if (ByteOffsetFromBase < FirstOffset) {
8204 FirstByteProvider = P;
8205 FirstOffset = ByteOffsetFromBase;
8210 assert(!Loads.empty() && "All the bytes of the value must be loaded from "
8211 "memory, so there must be at least one load which produces the value");
8212 assert(Base && "Base address of the accessed memory location must be set");
8213 assert(FirstOffset != INT64_MAX && "First byte offset must be set");
8215 bool NeedsZext = ZeroExtendedBytes > 0;
// With zero high bytes we only load the low (ByteWidth - ZeroExtendedBytes)
// bytes and zero-extend the result.
8218 EVT::getIntegerVT(*DAG.getContext(), (ByteWidth - ZeroExtendedBytes) * 8);
8220 if (!MemVT.isSimple())
8223 // Before legalize we can introduce too wide illegal loads which will be later
8224 // split into legal sized loads. This enables us to combine i64 load by i8
8225 // patterns to a couple of i32 loads on 32 bit targets.
8226 if (LegalOperations &&
8227 !TLI.isOperationLegal(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD,
8231 // Check if the bytes of the OR we are looking at match with either big or
8232 // little endian value load
8233 Optional<bool> IsBigEndian = isBigEndian(
8234 makeArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
8238 assert(FirstByteProvider && "must be set");
8240 // Ensure that the first byte is loaded from zero offset of the first load.
8241 // So the combined value can be loaded from the first load address.
8242 if (MemoryByteOffset(*FirstByteProvider) != 0)
8244 LoadSDNode *FirstLoad = FirstByteProvider->Load;
8246 // The node we are looking at matches with the pattern, check if we can
8247 // replace it with a single (possibly zero-extended) load and bswap + shift if
8250 // If the load needs byte swap check if the target supports it
8251 bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
8253 // Before legalize we can introduce illegal bswaps which will be later
8254 // converted to an explicit bswap sequence. This way we end up with a single
8255 // load and byte shuffling instead of several loads and byte shuffling.
8256 // We do not introduce illegal bswaps when zero-extending as this tends to
8257 // introduce too many arithmetic instructions.
8258 if (NeedsBswap && (LegalOperations || NeedsZext) &&
8259 !TLI.isOperationLegal(ISD::BSWAP, VT))
8262 // If we need to bswap and zero extend, we have to insert a shift. Check that
8264 if (NeedsBswap && NeedsZext && LegalOperations &&
8265 !TLI.isOperationLegal(ISD::SHL, VT))
8268 // Check that a load of the wide type is both allowed and fast on the target
8271 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
8272 *FirstLoad->getMemOperand(), &Fast)
8273 if (!Allowed || !Fast)
8277 DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
8278 Chain, FirstLoad->getBasePtr(),
8279 FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
8281 // Transfer chain users from old loads to the new load.
8282 for (LoadSDNode *L : Loads)
8283 DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
// When byte-swapping a zero-extended load, shift the loaded bytes up first
// so the swap places them in the correct (low) positions.
8288 SDValue ShiftedLoad =
8290 ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
8291 DAG.getShiftAmountConstant(ZeroExtendedBytes * 8, VT,
8292 SDLoc(N), LegalOperations))
8294 return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
8297 // If the target has andn, bsl, or a similar bit-select instruction,
8298 // we want to unfold masked merge, with canonical pattern of:
8300 // ((x ^ y) & m) ^ y
8303 // (x & m) | (y & ~m)
8304 // If y is a constant, m is not a 'not', and the 'andn' does not work with
8305 // immediates, we unfold into a different pattern:
8306 // ~(~x & m) & (m | y)
8307 // If x is a constant, m is a 'not', and the 'andn' does not work with
8308 // immediates, we unfold into a different pattern:
8309 // (x | ~m) & ~(~m & ~y)
8310 // NOTE: we don't unfold the pattern if 'xor' is actually a 'not', because at
8311 // the very least that breaks andnpd / andnps patterns, and because those
8312 // patterns are simplified in IR and shouldn't be created in the DAG
8313 SDValue DAGCombiner::unfoldMaskedMerge(SDNode *N) {
8314 assert(N->getOpcode() == ISD::XOR);
8316 // Don't touch 'not' (i.e. where y = -1).
8317 if (isAllOnesOrAllOnesSplat(N->getOperand(1)))
8320 EVT VT = N->getValueType(0);
8322 // There are 3 commutable operators in the pattern,
8323 // so we have to deal with 8 possible variants of the basic pattern.
// Try to match 'And' as ((x ^ y) & m) with the xor at operand XorIdx and
// 'Other' as the outer y; on success X, Y, M are bound.
8325 auto matchAndXor = [&X, &Y, &M](SDValue And, unsigned XorIdx, SDValue Other) {
8326 if (And.getOpcode() != ISD::AND || !And.hasOneUse())
8328 SDValue Xor = And.getOperand(XorIdx);
8329 if (Xor.getOpcode() != ISD::XOR || !Xor.hasOneUse())
8331 SDValue Xor0 = Xor.getOperand(0);
8332 SDValue Xor1 = Xor.getOperand(1);
8333 // Don't touch 'not' (i.e. where y = -1).
8334 if (isAllOnesOrAllOnesSplat(Xor1))
8337 std::swap(Xor0, Xor1);
8342 M = And.getOperand(XorIdx ? 0 : 1);
8346 SDValue N0 = N->getOperand(0);
8347 SDValue N1 = N->getOperand(1);
// Try the and/xor on either side of the outer xor, with the inner xor at
// either operand of the and.
8348 if (!matchAndXor(N0, 0, N1) && !matchAndXor(N0, 1, N1) &&
8349 !matchAndXor(N1, 0, N0) && !matchAndXor(N1, 1, N0))
8352 // Don't do anything if the mask is constant. This should not be reachable.
8353 // InstCombine should have already unfolded this pattern, and DAGCombiner
8354 // probably shouldn't produce it, too.
8355 if (isa<ConstantSDNode>(M.getNode()))
8358 // We can transform if the target has AndNot
8359 if (!TLI.hasAndNot(M))
8364 // If Y is a constant, check that 'andn' works with immediates. Unless M is
8365 // a bitwise not that would already allow ANDN to be used.
8366 if (!TLI.hasAndNot(Y) && !isBitwiseNot(M)) {
8367 assert(TLI.hasAndNot(X) && "Only mask is a variable? Unreachable.");
8368 // If not, we need to do a bit more work to make sure andn is still used.
// Emit the alternate form ~(~x & m) & (m | y) (see function comment).
8369 SDValue NotX = DAG.getNOT(DL, X, VT);
8370 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, NotX, M);
8371 SDValue NotLHS = DAG.getNOT(DL, LHS, VT);
8372 SDValue RHS = DAG.getNode(ISD::OR, DL, VT, M, Y);
8373 return DAG.getNode(ISD::AND, DL, VT, NotLHS, RHS);
8376 // If X is a constant and M is a bitwise not, check that 'andn' works with
8378 if (!TLI.hasAndNot(X) && isBitwiseNot(M)) {
8379 assert(TLI.hasAndNot(Y) && "Only mask is a variable? Unreachable.");
8380 // If not, we need to do a bit more work to make sure andn is still used.
// Emit the alternate form (x | ~m) & ~(~m & ~y) (see function comment).
8381 SDValue NotM = M.getOperand(0);
8382 SDValue LHS = DAG.getNode(ISD::OR, DL, VT, X, NotM);
8383 SDValue NotY = DAG.getNOT(DL, Y, VT);
8384 SDValue RHS = DAG.getNode(ISD::AND, DL, VT, NotM, NotY);
8385 SDValue NotRHS = DAG.getNOT(DL, RHS, VT);
8386 return DAG.getNode(ISD::AND, DL, VT, LHS, NotRHS);
// Canonical unfold: (x & m) | (y & ~m).
8389 SDValue LHS = DAG.getNode(ISD::AND, DL, VT, X, M);
8390 SDValue NotM = DAG.getNOT(DL, M, VT);
8391 SDValue RHS = DAG.getNode(ISD::AND, DL, VT, Y, NotM);
8393 return DAG.getNode(ISD::OR, DL, VT, LHS, RHS);
8396 SDValue DAGCombiner::visitXOR(SDNode *N) {
8397 SDValue N0 = N->getOperand(0);
8398 SDValue N1 = N->getOperand(1);
8399 EVT VT = N0.getValueType();
8402 // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
8403 if (N0.isUndef() && N1.isUndef())
8404 return DAG.getConstant(0, DL, VT);
8406 // fold (xor x, undef) -> undef
8412 // fold (xor c1, c2) -> c1^c2
8413 if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
8416 // canonicalize constant to RHS
8417 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
8418 !DAG.isConstantIntBuildVectorOrConstantInt(N1))
8419 return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
8422 if (VT.isVector()) {
8423 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
8426 // fold (xor x, 0) -> x, vector edition
8427 if (ISD::isConstantSplatVectorAllZeros(N1.getNode()))
8431 // fold (xor x, 0) -> x
8432 if (isNullConstant(N1))
8435 if (SDValue NewSel = foldBinOpIntoSelect(N))
8439 if (SDValue RXOR = reassociateOps(ISD::XOR, DL, N0, N1, N->getFlags()))
8442 // look for 'add-like' folds:
8443 // XOR(N0,MIN_SIGNED_VALUE) == ADD(N0,MIN_SIGNED_VALUE)
8444 if ((!LegalOperations || TLI.isOperationLegal(ISD::ADD, VT)) &&
8445 isMinSignedConstant(N1))
8446 if (SDValue Combined = visitADDLike(N))
8449 // fold !(x cc y) -> (x !cc y)
8450 unsigned N0Opcode = N0.getOpcode();
8451 SDValue LHS, RHS, CC;
8452 if (TLI.isConstTrueVal(N1) &&
8453 isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/ true)) {
8454 ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
8455 LHS.getValueType());
8456 if (!LegalOperations ||
8457 TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
8460 llvm_unreachable("Unhandled SetCC Equivalent!");
8462 return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
8463 case ISD::SELECT_CC:
8464 return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
8465 N0.getOperand(3), NotCC);
8466 case ISD::STRICT_FSETCC:
8467 case ISD::STRICT_FSETCCS: {
8468 if (N0.hasOneUse()) {
8469 // FIXME Can we handle multiple uses? Could we token factor the chain
8470 // results from the new/old setcc?
8472 DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
8473 N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
8474 CombineTo(N, SetCC);
8475 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
8476 recursivelyDeleteUnusedNodes(N0.getNode());
8477 return SDValue(N, 0); // Return N so it doesn't get rechecked!
8485 // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
8486 if (isOneConstant(N1) && N0Opcode == ISD::ZERO_EXTEND && N0.hasOneUse() &&
8487 isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
8488 SDValue V = N0.getOperand(0);
8490 V = DAG.getNode(ISD::XOR, DL0, V.getValueType(), V,
8491 DAG.getConstant(1, DL0, V.getValueType()));
8492 AddToWorklist(V.getNode());
8493 return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, V);
8496 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
8497 if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
8498 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
8499 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
8500 if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
8501 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
8502 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
8503 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
8504 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
8505 return DAG.getNode(NewOpcode, DL, VT, N00, N01);
8508 // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
8509 if (isAllOnesConstant(N1) && N0.hasOneUse() &&
8510 (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
8511 SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
8512 if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
8513 unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
8514 N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
8515 N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
8516 AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
8517 return DAG.getNode(NewOpcode, DL, VT, N00, N01);
8521 // fold (not (neg x)) -> (add X, -1)
8522 // FIXME: This can be generalized to (not (sub Y, X)) -> (add X, ~Y) if
8523 // Y is a constant or the subtract has a single use.
8524 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::SUB &&
8525 isNullConstant(N0.getOperand(0))) {
8526 return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(1),
8527 DAG.getAllOnesConstant(DL, VT));
8530 // fold (not (add X, -1)) -> (neg X)
8531 if (isAllOnesConstant(N1) && N0.getOpcode() == ISD::ADD &&
8532 isAllOnesOrAllOnesSplat(N0.getOperand(1))) {
8533 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
8537 // fold (xor (and x, y), y) -> (and (not x), y)
8538 if (N0Opcode == ISD::AND && N0.hasOneUse() && N0->getOperand(1) == N1) {
8539 SDValue X = N0.getOperand(0);
8540 SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
8541 AddToWorklist(NotX.getNode());
8542 return DAG.getNode(ISD::AND, DL, VT, NotX, N1);
8545 // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
8546 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
8547 SDValue A = N0Opcode == ISD::ADD ? N0 : N1;
8548 SDValue S = N0Opcode == ISD::SRA ? N0 : N1;
8549 if (A.getOpcode() == ISD::ADD && S.getOpcode() == ISD::SRA) {
8550 SDValue A0 = A.getOperand(0), A1 = A.getOperand(1);
8551 SDValue S0 = S.getOperand(0);
8552 if ((A0 == S && A1 == S0) || (A1 == S && A0 == S0))
8553 if (ConstantSDNode *C = isConstOrConstSplat(S.getOperand(1)))
8554 if (C->getAPIntValue() == (VT.getScalarSizeInBits() - 1))
8555 return DAG.getNode(ISD::ABS, DL, VT, S0);
8559 // fold (xor x, x) -> 0
8561 return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
8563 // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
8564 // Here is a concrete example of this equivalence:
8566 // i16 shl == 1 << 14 == 16384 == 0b0100000000000000
8567 // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
8571 // i16 ~1 == 0b1111111111111110
8572 // i16 rol(~1, 14) == 0b1011111111111111
8574 // Some additional tips to help conceptualize this transform:
8575 // - Try to see the operation as placing a single zero in a value of all ones.
8576 // - There exists no value for x which would allow the result to contain zero.
8577 // - Values of x larger than the bitwidth are undefined and do not require a
8578 // consistent result.
8579 // - Pushing the zero left requires shifting one bits in from the right.
8580 // A rotate left of ~1 is a nice way of achieving the desired result.
8581 if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0Opcode == ISD::SHL &&
8582 isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
8583 return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
8587 // Simplify: xor (op x...), (op y...) -> (op (xor x, y))
8588 if (N0Opcode == N1.getOpcode())
8589 if (SDValue V = hoistLogicOpWithSameOpcodeHands(N))
8592 if (SDValue R = foldLogicOfShifts(N, N0, N1, DAG))
8594 if (SDValue R = foldLogicOfShifts(N, N1, N0, DAG))
8597 // Unfold ((x ^ y) & m) ^ y into (x & m) | (y & ~m) if profitable
8598 if (SDValue MM = unfoldMaskedMerge(N))
8601 // Simplify the expression using non-local knowledge.
8602 if (SimplifyDemandedBits(SDValue(N, 0)))
8603 return SDValue(N, 0);
8605 if (SDValue Combined = combineCarryDiamond(DAG, TLI, N0, N1, N))
8611 /// If we have a shift-by-constant of a bitwise logic op that itself has a
8612 /// shift-by-constant operand with identical opcode, we may be able to convert
8613 /// that into 2 independent shifts followed by the logic op. This is a
8614 /// throughput improvement.
8615 static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
8616 // Match a one-use bitwise logic op.
8617 SDValue LogicOp = Shift->getOperand(0);
8618 if (!LogicOp.hasOneUse())
8621 unsigned LogicOpcode = LogicOp.getOpcode();
8622 if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
8623 LogicOpcode != ISD::XOR)
8626 // Find a matching one-use shift by constant.
8627 unsigned ShiftOpcode = Shift->getOpcode();
8628 SDValue C1 = Shift->getOperand(1);
// Callers guarantee the outer shift amount is a constant (or constant
// splat), so this lookup cannot fail (asserted below).
8629 ConstantSDNode *C1Node = isConstOrConstSplat(C1);
8630 assert(C1Node && "Expected a shift with constant operand");
8631 const APInt &C1Val = C1Node->getAPIntValue();
// Predicate: does V match (ShiftOpcode X, C0) with a single use? On success,
// ShiftOp receives X and ShiftAmtVal points at C0's APInt value.
8632 auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
8633 const APInt *&ShiftAmtVal) {
8634 if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
8637 ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
8641 // Capture the shifted operand and shift amount value.
8642 ShiftOp = V.getOperand(0);
8643 ShiftAmtVal = &ShiftCNode->getAPIntValue();
8645 // Shift amount types do not have to match their operand type, so check that
8646 // the constants are the same width.
8647 if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
8650 // The fold is not valid if the sum of the shift values exceeds bitwidth.
8651 if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
8657 // Logic ops are commutative, so check each operand for a match.
// On a match, X is the operand feeding the inner shift and Y is the logic
// operand that was not shifted.
8660 if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
8661 Y = LogicOp.getOperand(1);
8662 else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
8663 Y = LogicOp.getOperand(0);
8667 // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
8669 EVT VT = Shift->getValueType(0);
8670 EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
8671 SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
8672 SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
8673 SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
8674 return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
8677 /// Handle transforms common to the three shifts, when the shift amount is a
8679 /// We are looking for: (shift being one of shl/sra/srl)
8680 /// shift (binop X, C0), C1
8681 /// And want to transform into:
8682 /// binop (shift X, C1), (shift C0, C1)
8683 SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
8684 assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
8686 // Do not turn a 'not' into a regular xor.
8687 if (isBitwiseNot(N->getOperand(0)))
8690 // The inner binop must be one-use, since we want to replace it.
8691 SDValue LHS = N->getOperand(0);
8692 if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
8695 // TODO: This is limited to early combining because it may reveal regressions
8696 // otherwise. But since we just checked a target hook to see if this is
8697 // desirable, that should have filtered out cases where this interferes
8698 // with some other pattern matching.
// First try the shift-of-shifted-logic fold: and/or/xor whose operand is
// another shift with the same opcode as N.
8700 if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
8703 // We want to pull some binops through shifts, so that we have (and (shift))
8704 // instead of (shift (and)), likewise for add, or, xor, etc. This sort of
8705 // thing happens with address calculations, so it's important to canonicalize
8707 switch (LHS.getOpcode()) {
// ADD only distributes over a left shift; a right shift of a sum is not the
// sum of the right shifts.
8715 if (N->getOpcode() != ISD::SHL)
8716 return SDValue(); // only shl(add) not sr[al](add).
8720 // We require the RHS of the binop to be a constant and not opaque as well.
8721 ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
8725 // FIXME: disable this unless the input to the binop is a shift by a constant
8726 // or is copy/select. Enable this in other cases when figure out it's exactly
8728 SDValue BinOpLHSVal = LHS.getOperand(0);
8729 bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
8730 BinOpLHSVal.getOpcode() == ISD::SRA ||
8731 BinOpLHSVal.getOpcode() == ISD::SRL) &&
8732 isa<ConstantSDNode>(BinOpLHSVal.getOperand(1));
8733 bool IsCopyOrSelect = BinOpLHSVal.getOpcode() == ISD::CopyFromReg ||
8734 BinOpLHSVal.getOpcode() == ISD::SELECT;
8736 if (!IsShiftByConstant && !IsCopyOrSelect)
8739 if (IsCopyOrSelect && N->hasOneUse())
8742 // Fold the constants, shifting the binop RHS by the shift amount.
8744 EVT VT = N->getValueType(0);
// NewRHS = (shift C0, C1); constant folding is expected to succeed since
// both operands are non-opaque constants (asserted below).
8745 SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
8747 assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
8749 SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
8751 return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
// Distribute a truncate through (and X, C): if both the truncate and the AND
// are single-use and the target considers AND desirable at the narrow type,
// produce (and (trunc X), (trunc C)). Callers use the result as a narrower
// shift/rotate amount.
8754 SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
8755 assert(N->getOpcode() == ISD::TRUNCATE);
8756 assert(N->getOperand(0).getOpcode() == ISD::AND);
8758 // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
8759 EVT TruncVT = N->getValueType(0);
8760 if (N->hasOneUse() && N->getOperand(0).hasOneUse() &&
8761 TLI.isTypeDesirableForOp(ISD::AND, TruncVT)) {
8762 SDValue N01 = N->getOperand(0).getOperand(1);
// Only fold when the mask is a non-opaque constant (or constant vector).
8763 if (isConstantOrConstantVector(N01, /* NoOpaques */ true)) {
8765 SDValue N00 = N->getOperand(0).getOperand(0);
8766 SDValue Trunc00 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00);
8767 SDValue Trunc01 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N01);
8768 AddToWorklist(Trunc00.getNode());
8769 AddToWorklist(Trunc01.getNode());
8770 return DAG.getNode(ISD::AND, DL, TruncVT, Trunc00, Trunc01);
8777 SDValue DAGCombiner::visitRotate(SDNode *N) {
8779 SDValue N0 = N->getOperand(0);
8780 SDValue N1 = N->getOperand(1);
8781 EVT VT = N->getValueType(0);
8782 unsigned Bitsize = VT.getScalarSizeInBits();
8784 // fold (rot x, 0) -> x
8785 if (isNullOrNullSplat(N1))
8788 // fold (rot x, c) -> x iff (c % BitSize) == 0
// For a power-of-2 bitsize, (c % Bitsize) == 0 iff the low log2(Bitsize)
// bits of the amount are known zero.
8789 if (isPowerOf2_32(Bitsize) && Bitsize > 1) {
8790 APInt ModuloMask(N1.getScalarValueSizeInBits(), Bitsize - 1);
8791 if (DAG.MaskedValueIsZero(N1, ModuloMask))
8795 // fold (rot x, c) -> (rot x, c % BitSize)
8796 bool OutOfRange = false;
8797 auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
8798 OutOfRange |= C->getAPIntValue().uge(Bitsize);
8801 if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
8802 EVT AmtVT = N1.getValueType();
8803 SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
8805 DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
8806 return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
8809 // rot i16 X, 8 --> bswap X
// Rotating by exactly half the 16-bit width is direction-independent, so
// this applies to both ROTL and ROTR.
8810 auto *RotAmtC = isConstOrConstSplat(N1);
8811 if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
8812 VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
8813 return DAG.getNode(ISD::BSWAP, dl, VT, N0);
8815 // Simplify the operands using demanded-bits information.
8816 if (SimplifyDemandedBits(SDValue(N, 0)))
8817 return SDValue(N, 0);
8819 // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
8820 if (N1.getOpcode() == ISD::TRUNCATE &&
8821 N1.getOperand(0).getOpcode() == ISD::AND) {
8822 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
8823 return DAG.getNode(N->getOpcode(), dl, VT, N0, NewOp1);
8826 unsigned NextOp = N0.getOpcode();
8828 // fold (rot* (rot* x, c2), c1)
8829 // -> (rot* x, ((c1 % bitsize) +- (c2 % bitsize)) % bitsize)
8830 if (NextOp == ISD::ROTL || NextOp == ISD::ROTR) {
8831 SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N1);
8832 SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1));
8833 if (C1 && C2 && C1->getValueType(0) == C2->getValueType(0)) {
8834 EVT ShiftVT = C1->getValueType(0);
// Same-direction rotates add their amounts; opposite directions subtract.
8835 bool SameSide = (N->getOpcode() == NextOp);
8836 unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
8837 SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
// Reduce both amounts modulo the bitwidth before combining (per the
// formula in the comment above).
8838 SDValue Norm1 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
8840 SDValue Norm2 = DAG.FoldConstantArithmetic(ISD::UREM, dl, ShiftVT,
8841 {N0.getOperand(1), BitsizeC});
8843 if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
8844 CombineOp, dl, ShiftVT, {Norm1, Norm2})) {
8845 SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
8846 ISD::UREM, dl, ShiftVT, {CombinedShift, BitsizeC});
8847 return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
8855 SDValue DAGCombiner::visitSHL(SDNode *N) {
8856 SDValue N0 = N->getOperand(0);
8857 SDValue N1 = N->getOperand(1);
8858 if (SDValue V = DAG.simplifyShift(N0, N1))
8861 EVT VT = N0.getValueType();
8862 EVT ShiftVT = N1.getValueType();
8863 unsigned OpSizeInBits = VT.getScalarSizeInBits();
8865 // fold (shl c1, c2) -> c1<<c2
8866 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
8870 if (VT.isVector()) {
8871 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
8874 BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
8875 // If setcc produces all-one true value then:
8876 // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
8877 if (N1CV && N1CV->isConstant()) {
8878 if (N0.getOpcode() == ISD::AND) {
8879 SDValue N00 = N0->getOperand(0);
8880 SDValue N01 = N0->getOperand(1);
8881 BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
8883 if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
8884 TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
8885 TargetLowering::ZeroOrNegativeOneBooleanContent) {
8887 DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
8888 return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
8894 if (SDValue NewSel = foldBinOpIntoSelect(N))
8897 // if (shl x, c) is known to be zero, return 0
8898 if (DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
8899 return DAG.getConstant(0, SDLoc(N), VT);
8901 // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
8902 if (N1.getOpcode() == ISD::TRUNCATE &&
8903 N1.getOperand(0).getOpcode() == ISD::AND) {
8904 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
8905 return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
8908 if (SimplifyDemandedBits(SDValue(N, 0)))
8909 return SDValue(N, 0);
8911 // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
8912 if (N0.getOpcode() == ISD::SHL) {
// c1+c2 is computed in a widened APInt (extra overflow bit) so the sum
// comparison against the bitwidth cannot wrap.
8913 auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
8914 ConstantSDNode *RHS) {
8915 APInt c1 = LHS->getAPIntValue();
8916 APInt c2 = RHS->getAPIntValue();
8917 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8918 return (c1 + c2).uge(OpSizeInBits);
8920 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
8921 return DAG.getConstant(0, SDLoc(N), VT);
8923 auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
8924 ConstantSDNode *RHS) {
8925 APInt c1 = LHS->getAPIntValue();
8926 APInt c2 = RHS->getAPIntValue();
8927 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8928 return (c1 + c2).ult(OpSizeInBits);
8930 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
8932 SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
8933 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Sum);
8937 // fold (shl (ext (shl x, c1)), c2) -> (shl (ext x), (add c1, c2))
8938 // For this to be valid, the second form must not preserve any of the bits
8939 // that are shifted out by the inner shift in the first form. This means
8940 // the outer shift size must be >= the number of bits added by the ext.
8941 // As a corollary, we don't care what kind of ext it is.
8942 if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
8943 N0.getOpcode() == ISD::ANY_EXTEND ||
8944 N0.getOpcode() == ISD::SIGN_EXTEND) &&
8945 N0.getOperand(0).getOpcode() == ISD::SHL) {
8946 SDValue N0Op0 = N0.getOperand(0);
8947 SDValue InnerShiftAmt = N0Op0.getOperand(1);
8948 EVT InnerVT = N0Op0.getValueType();
8949 uint64_t InnerBitwidth = InnerVT.getScalarSizeInBits();
8951 auto MatchOutOfRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
8952 ConstantSDNode *RHS) {
8953 APInt c1 = LHS->getAPIntValue();
8954 APInt c2 = RHS->getAPIntValue();
8955 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8956 return c2.uge(OpSizeInBits - InnerBitwidth) &&
8957 (c1 + c2).uge(OpSizeInBits);
8959 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchOutOfRange,
8960 /*AllowUndefs*/ false,
8961 /*AllowTypeMismatch*/ true))
8962 return DAG.getConstant(0, SDLoc(N), VT);
8964 auto MatchInRange = [OpSizeInBits, InnerBitwidth](ConstantSDNode *LHS,
8965 ConstantSDNode *RHS) {
8966 APInt c1 = LHS->getAPIntValue();
8967 APInt c2 = RHS->getAPIntValue();
8968 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
8969 return c2.uge(OpSizeInBits - InnerBitwidth) &&
8970 (c1 + c2).ult(OpSizeInBits);
8972 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchInRange,
8973 /*AllowUndefs*/ false,
8974 /*AllowTypeMismatch*/ true)) {
8976 SDValue Ext = DAG.getNode(N0.getOpcode(), DL, VT, N0Op0.getOperand(0));
8977 SDValue Sum = DAG.getZExtOrTrunc(InnerShiftAmt, DL, ShiftVT);
8978 Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, Sum, N1);
8979 return DAG.getNode(ISD::SHL, DL, VT, Ext, Sum);
8983 // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
8984 // Only fold this if the inner zext has no other uses to avoid increasing
8985 // the total number of instructions.
8986 if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
8987 N0.getOperand(0).getOpcode() == ISD::SRL) {
8988 SDValue N0Op0 = N0.getOperand(0);
8989 SDValue InnerShiftAmt = N0Op0.getOperand(1);
8991 auto MatchEqual = [VT](ConstantSDNode *LHS, ConstantSDNode *RHS) {
8992 APInt c1 = LHS->getAPIntValue();
8993 APInt c2 = RHS->getAPIntValue();
8994 zeroExtendToMatch(c1, c2);
8995 return c1.ult(VT.getScalarSizeInBits()) && (c1 == c2);
8997 if (ISD::matchBinaryPredicate(InnerShiftAmt, N1, MatchEqual,
8998 /*AllowUndefs*/ false,
8999 /*AllowTypeMismatch*/ true)) {
9001 EVT InnerShiftAmtVT = N0Op0.getOperand(1).getValueType();
9002 SDValue NewSHL = DAG.getZExtOrTrunc(N1, DL, InnerShiftAmtVT);
9003 NewSHL = DAG.getNode(ISD::SHL, DL, N0Op0.getValueType(), N0Op0, NewSHL);
9004 AddToWorklist(NewSHL.getNode());
9005 return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
9009 if (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) {
// Shared predicate for the shl-of-right-shift folds below: both amounts
// must be in range and the first must be <= the second.
9010 auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
9011 ConstantSDNode *RHS) {
9012 const APInt &LHSC = LHS->getAPIntValue();
9013 const APInt &RHSC = RHS->getAPIntValue();
9014 return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
9015 LHSC.getZExtValue() <= RHSC.getZExtValue();
9020 // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
9021 // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C1-C2)) if C1 >= C2
9022 if (N0->getFlags().hasExact()) {
9023 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
9024 /*AllowUndefs*/ false,
9025 /*AllowTypeMismatch*/ true)) {
9026 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9027 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
9028 return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
9030 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
9031 /*AllowUndefs*/ false,
9032 /*AllowTypeMismatch*/ true)) {
9033 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9034 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
9035 return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0), Diff);
9039 // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) or
9040 // (and (srl x, (sub c1, c2)), MASK)
9041 // Only fold this if the inner shift has no other uses -- if it does,
9042 // folding this will increase the total number of instructions.
9043 if (N0.getOpcode() == ISD::SRL &&
9044 (N0.getOperand(1) == N1 || N0.hasOneUse()) &&
9045 TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
9046 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
9047 /*AllowUndefs*/ false,
9048 /*AllowTypeMismatch*/ true)) {
9049 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9050 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
// Mask keeps only the bits that survive both shifts.
9051 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9052 Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N01);
9053 Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, Diff);
9054 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
9055 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9057 if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
9058 /*AllowUndefs*/ false,
9059 /*AllowTypeMismatch*/ true)) {
9060 SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
9061 SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
9062 SDValue Mask = DAG.getAllOnesConstant(DL, VT);
9063 Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, N1);
9064 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
9065 return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
9070 // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
9071 if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
9072 isConstantOrConstantVector(N1, /* No Opaques */ true)) {
9074 SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
9075 SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
9076 return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
9079 // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
9080 // fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
9081 // Variant of version done on multiply, except mul by a power of 2 is turned
9083 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
9085 isConstantOrConstantVector(N1, /* No Opaques */ true) &&
9086 isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true) &&
9087 TLI.isDesirableToCommuteWithShift(N, Level)) {
9088 SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
9089 SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
9090 AddToWorklist(Shl0.getNode());
9091 AddToWorklist(Shl1.getNode());
9092 return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
9095 // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
9096 if (N0.getOpcode() == ISD::MUL && N0->hasOneUse()) {
9097 SDValue N01 = N0.getOperand(1);
9099 DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, {N01, N1}))
9100 return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Shl);
9103 ConstantSDNode *N1C = isConstOrConstSplat(N1);
// Generic binop-through-shift transforms; skip opaque constants, which must
// not be folded.
9104 if (N1C && !N1C->isOpaque())
9105 if (SDValue NewSHL = visitShiftByConstant(N))
9108 // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
9109 if (N0.getOpcode() == ISD::VSCALE)
9110 if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
9111 const APInt &C0 = N0.getConstantOperandAPInt(0);
9112 const APInt &C1 = NC1->getAPIntValue();
9113 return DAG.getVScale(SDLoc(N), VT, C0 << C1);
9116 // Fold (shl step_vector(C0), C1) to (step_vector(C0 << C1)).
9118 if (N0.getOpcode() == ISD::STEP_VECTOR)
9119 if (ISD::isConstantSplatVector(N1.getNode(), ShlVal)) {
9120 const APInt &C0 = N0.getConstantOperandAPInt(0);
9121 if (ShlVal.ult(C0.getBitWidth())) {
9122 APInt NewStep = C0 << ShlVal;
9123 return DAG.getStepVector(SDLoc(N), VT, NewStep);
9130 // Transform a right shift of a multiply into a multiply-high.
9132 // (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
9133 // (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
9134 static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
9135 const TargetLowering &TLI) {
9136 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
9137 "SRL or SRA node is required here!");
9139 // Check the shift amount. Proceed with the transformation if the shift
9140 // amount is constant.
9141 ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
9147 // The operation feeding into the shift must be a multiply.
9148 SDValue ShiftOperand = N->getOperand(0);
9149 if (ShiftOperand.getOpcode() != ISD::MUL)
9152 // Both operands must be equivalent extend nodes.
9153 SDValue LeftOp = ShiftOperand.getOperand(0);
9154 SDValue RightOp = ShiftOperand.getOperand(1);
9156 bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
9157 bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
9159 if (!IsSignExt && !IsZeroExt)
9162 EVT NarrowVT = LeftOp.getOperand(0).getValueType();
9163 unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
// The multiply RHS may be a constant rather than a matching extend: accept
// it when it fits in the narrow type (signed or unsigned fit, depending on
// the LHS extension kind) and truncate it to the narrow type.
9165 SDValue MulhRightOp;
9166 if (ConstantSDNode *Constant = isConstOrConstSplat(RightOp)) {
9167 unsigned ActiveBits = IsSignExt
9168 ? Constant->getAPIntValue().getMinSignedBits()
9169 : Constant->getAPIntValue().getActiveBits();
9170 if (ActiveBits > NarrowVTSize)
9172 MulhRightOp = DAG.getConstant(
9173 Constant->getAPIntValue().trunc(NarrowVT.getScalarSizeInBits()), DL,
9176 if (LeftOp.getOpcode() != RightOp.getOpcode())
9178 // Check that the two extend nodes are the same type.
9179 if (NarrowVT != RightOp.getOperand(0).getValueType())
9181 MulhRightOp = RightOp.getOperand(0);
9184 EVT WideVT = LeftOp.getValueType();
9185 // Proceed with the transformation if the wide types match.
9186 assert((WideVT == RightOp.getValueType()) &&
9187 "Cannot have a multiply node with two different operand types.");
9189 // Proceed with the transformation if the wide type is twice as large
9190 // as the narrow type.
9191 if (WideVT.getScalarSizeInBits() != 2 * NarrowVTSize)
9194 // Check the shift amount with the narrow type size.
9195 // Proceed with the transformation if the shift amount is the width
9196 // of the narrow type.
9197 unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
9198 if (ShiftAmt != NarrowVTSize)
9201 // If the operation feeding into the MUL is a sign extend (sext),
9202 // we use mulhs. Otherwise, zero extends (zext) use mulhu.
9203 unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
9205 // Combine to mulh if mulh is legal/custom for the narrow type on the target.
9206 if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))
9210 DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);
// Re-extend the narrow mulh result back to the wide type: sign extend for
// SRA, zero extend for SRL.
9211 return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT)
9212 : DAG.getZExtOrTrunc(Result, DL, WideVT));
9215 SDValue DAGCombiner::visitSRA(SDNode *N) {
9216 SDValue N0 = N->getOperand(0);
9217 SDValue N1 = N->getOperand(1);
9218 if (SDValue V = DAG.simplifyShift(N0, N1))
9221 EVT VT = N0.getValueType();
9222 unsigned OpSizeInBits = VT.getScalarSizeInBits();
9224 // fold (sra c1, c2) -> (sra c1, c2)
9225 if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
9228 // Arithmetic shifting an all-sign-bit value is a no-op.
9229 // fold (sra 0, x) -> 0
9230 // fold (sra -1, x) -> -1
9231 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits)
9236 if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
9239 if (SDValue NewSel = foldBinOpIntoSelect(N))
9242 // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
9244 ConstantSDNode *N1C = isConstOrConstSplat(N1);
9245 if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
9246 unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
9247 EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
9249 ExtVT = EVT::getVectorVT(*DAG.getContext(), ExtVT,
9250 VT.getVectorElementCount());
9251 if (!LegalOperations ||
9252 TLI.getOperationAction(ISD::SIGN_EXTEND_INREG, ExtVT) ==
9253 TargetLowering::Legal)
9254 return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
9255 N0.getOperand(0), DAG.getValueType(ExtVT));
9256 // Even if we can't convert to sext_inreg, we might be able to remove
9257 // this shift pair if the input is already sign extended.
9258 if (DAG.ComputeNumSignBits(N0.getOperand(0)) > N1C->getZExtValue())
9259 return N0.getOperand(0);
9262 // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
9263 // clamp (add c1, c2) to max shift.
9264 if (N0.getOpcode() == ISD::SRA) {
9266 EVT ShiftVT = N1.getValueType();
9267 EVT ShiftSVT = ShiftVT.getScalarType();
9268 SmallVector<SDValue, 16> ShiftValues;
9270 auto SumOfShifts = [&](ConstantSDNode *LHS, ConstantSDNode *RHS) {
9271 APInt c1 = LHS->getAPIntValue();
9272 APInt c2 = RHS->getAPIntValue();
9273 zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
9274 APInt Sum = c1 + c2;
9276 Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
9277 ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
9280 if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
9282 if (N1.getOpcode() == ISD::BUILD_VECTOR)
9283 ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
9284 else if (N1.getOpcode() == ISD::SPLAT_VECTOR) {
9285 assert(ShiftValues.size() == 1 &&
9286 "Expected matchBinaryPredicate to return one element for "
9288 ShiftValue = DAG.getSplatVector(ShiftVT, DL, ShiftValues[0]);
9290 ShiftValue = ShiftValues[0];
9291 return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
9295 // fold (sra (shl X, m), (sub result_size, n))
9296 // -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
9297 // result_size - n != m.
9298 // If truncate is free for the target sext(shl) is likely to result in better
9300 if (N0.getOpcode() == ISD::SHL && N1C) {
9301 // Get the two constants of the shifts, CN0 = m, CN = n.
9302 const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
9304 LLVMContext &Ctx = *DAG.getContext();
9305 // Determine what the truncate's result bitsize and type would be.
9306 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
9309 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
9311 // Determine the residual right-shift amount.
9312 int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
9314 // If the shift is not a no-op (in which case this should be just a sign
9315 // extend already), the truncated to type is legal, sign_extend is legal
9316 // on that type, and the truncate to that type is both legal and free,
9317 // perform the transform.
9318 if ((ShiftAmt > 0) &&
9319 TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
9320 TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
9321 TLI.isTruncateFree(VT, TruncVT)) {
9323 SDValue Amt = DAG.getConstant(ShiftAmt, DL,
9324 getShiftAmountTy(N0.getOperand(0).getValueType()));
9325 SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
9326 N0.getOperand(0), Amt);
9327 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
9329 return DAG.getNode(ISD::SIGN_EXTEND, DL,
9330 N->getValueType(0), Trunc);
9335 // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
9336 // sra (add (shl X, N1C), AddC), N1C -->
9337 // sext (add (trunc X to (width - N1C)), AddC')
9338 // sra (sub AddC, (shl X, N1C)), N1C -->
9339 // sext (sub AddC1',(trunc X to (width - N1C)))
9340 if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB) && N1C &&
9342 bool IsAdd = N0.getOpcode() == ISD::ADD;
9343 SDValue Shl = N0.getOperand(IsAdd ? 0 : 1);
9344 if (Shl.getOpcode() == ISD::SHL && Shl.getOperand(1) == N1 &&
9346 // TODO: AddC does not need to be a splat.
9347 if (ConstantSDNode *AddC =
9348 isConstOrConstSplat(N0.getOperand(IsAdd ? 1 : 0))) {
9349 // Determine what the truncate's type would be and ask the target if
9350 // that is a free operation.
9351 LLVMContext &Ctx = *DAG.getContext();
9352 unsigned ShiftAmt = N1C->getZExtValue();
9353 EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
9355 TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorElementCount());
9357 // TODO: The simple type check probably belongs in the default hook
9358 // implementation and/or target-specific overrides (because
9359 // non-simple types likely require masking when legalized), but
9360 // that restriction may conflict with other transforms.
9361 if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
9362 TLI.isTruncateFree(VT, TruncVT)) {
9364 SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
9366 DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).trunc(
9367 TruncVT.getScalarSizeInBits()),
9371 Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
9373 Add = DAG.getNode(ISD::SUB, DL, TruncVT, ShiftC, Trunc);
9374 return DAG.getSExtOrTrunc(Add, DL, VT);
9380 // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
9381 if (N1.getOpcode() == ISD::TRUNCATE &&
9382 N1.getOperand(0).getOpcode() == ISD::AND) {
9383 if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
9384 return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
9387 // fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
9388 // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
9389 // if c1 is equal to the number of bits the trunc removes
9390 // TODO - support non-uniform vector shift amounts.
9391 if (N0.getOpcode() == ISD::TRUNCATE &&
9392 (N0.getOperand(0).getOpcode() == ISD::SRL ||
9393 N0.getOperand(0).getOpcode() == ISD::SRA) &&
9394 N0.getOperand(0).hasOneUse() &&
9395 N0.getOperand(0).getOperand(1).hasOneUse() && N1C) {
9396 SDValue N0Op0 = N0.getOperand(0);
9397 if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
9398 EVT LargeVT = N0Op0.getValueType();
9399 unsigned TruncBits = LargeVT.getScalarSizeInBits() - OpSizeInBits;
9400 if (LargeShift->getAPIntValue() == TruncBits) {
9402 EVT LargeShiftVT = getShiftAmountTy(LargeVT);
9403 SDValue Amt = DAG.getZExtOrTrunc(N1, DL, LargeShiftVT);
9404 Amt = DAG.getNode(ISD::ADD, DL, LargeShiftVT, Amt,
9405 DAG.getConstant(TruncBits, DL, LargeShiftVT));
9407 DAG.getNode(ISD::SRA, DL, LargeVT, N0Op0.getOperand(0), Amt);
9408 return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
9413 // Simplify, based on bits shifted out of the LHS.
9414 if (SimplifyDemandedBits(SDValue(N, 0)))
9415 return SDValue(N, 0);
9417 // If the sign bit is known to be zero, switch this to a SRL.
9418 if (DAG.SignBitIsZero(N0))
9419 return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
9421 if (N1C && !N1C->isOpaque())
9422 if (SDValue NewSRA = visitShiftByConstant(N))
9425 // Try to transform this shift into a multiply-high if
9426 // it matches the appropriate pattern detected in combineShiftToMULH.
9427 if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
9430 // Attempt to convert a sra of a load into a narrower sign-extending load.
9431 if (SDValue NarrowLoad = reduceLoadWidth(N))
/// Combine step for ISD::SRL (logical shift right).
/// Tries a sequence of independent folds; the first that succeeds returns the
/// replacement SDValue, otherwise control falls through to the next attempt.
SDValue DAGCombiner::visitSRL(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
  EVT VT = N0.getValueType();
  EVT ShiftVT = N1.getValueType();
  unsigned OpSizeInBits = VT.getScalarSizeInBits();
  // fold (srl c1, c2) -> c1 >>u c2
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
  if (SDValue FoldedVOp = SimplifyVBinOp(N, SDLoc(N)))
  if (SDValue NewSel = foldBinOpIntoSelect(N))
  // if (srl x, c) is known to be zero, return 0
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
      DAG.MaskedValueIsZero(SDValue(N, 0), APInt::getAllOnes(OpSizeInBits)))
    return DAG.getConstant(0, SDLoc(N), VT);
  // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
  if (N0.getOpcode() == ISD::SRL) {
    // Out-of-range: the two amounts together shift out every bit, so the
    // result is known zero. The amounts are zero-extended with an extra
    // overflow bit so the addition below cannot wrap.
    auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,
                                          ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).uge(OpSizeInBits);
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
      return DAG.getConstant(0, SDLoc(N), VT);
    // In-range: the combined amount is a valid shift, so merge the two
    // shifts into one.
    auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
                                       ConstantSDNode *RHS) {
      APInt c1 = LHS->getAPIntValue();
      APInt c2 = RHS->getAPIntValue();
      zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
      return (c1 + c2).ult(OpSizeInBits);
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {
      SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));
      return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Sum);
  // Look through a truncate of an inner srl: the two shifts can sometimes be
  // merged in the wider type and truncated afterwards.
  if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
      N0.getOperand(0).getOpcode() == ISD::SRL) {
    SDValue InnerShift = N0.getOperand(0);
    // TODO - support non-uniform vector shift amounts.
    if (auto *N001C = isConstOrConstSplat(InnerShift.getOperand(1))) {
      uint64_t c1 = N001C->getZExtValue();
      uint64_t c2 = N1C->getZExtValue();
      EVT InnerShiftVT = InnerShift.getValueType();
      EVT ShiftAmtVT = InnerShift.getOperand(1).getValueType();
      uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
      // srl (trunc (srl x, c1)), c2 --> 0 or (trunc (srl x, (add c1, c2)))
      // This is only valid if the OpSizeInBits + c1 = size of inner shift.
      if (c1 + OpSizeInBits == InnerShiftSize) {
        if (c1 + c2 >= InnerShiftSize)
          return DAG.getConstant(0, DL, VT);
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, NewShift);
      // In the more general case, we can clear the high bits after the shift:
      // srl (trunc (srl x, c1)), c2 --> trunc (and (srl x, (c1+c2)), Mask)
      if (N0.hasOneUse() && InnerShift.hasOneUse() &&
          c1 + c2 < InnerShiftSize) {
        SDValue NewShiftAmt = DAG.getConstant(c1 + c2, DL, ShiftAmtVT);
        SDValue NewShift = DAG.getNode(ISD::SRL, DL, InnerShiftVT,
                                       InnerShift.getOperand(0), NewShiftAmt);
        SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(InnerShiftSize,
        SDValue And = DAG.getNode(ISD::AND, DL, InnerShiftVT, NewShift, Mask);
        return DAG.getNode(ISD::TRUNCATE, DL, VT, And);
  // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
  // (and (srl x, (sub c2, c1), MASK)
  if (N0.getOpcode() == ISD::SHL &&
      (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
    // Both amounts must be in range; the predicate also orders them so the
    // SUB below cannot go negative. Type mismatch between the two shift
    // amounts is tolerated and reconciled with getZExtOrTrunc below.
    auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
                                           ConstantSDNode *RHS) {
      const APInt &LHSC = LHS->getAPIntValue();
      const APInt &RHSC = RHS->getAPIntValue();
      return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
             LHSC.getZExtValue() <= RHSC.getZExtValue();
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
      Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
      SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
  // TODO - support non-uniform vector shift amounts.
  if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
    // Shifting in all undef bits?
    EVT SmallVT = N0.getOperand(0).getValueType();
    unsigned BitSize = SmallVT.getScalarSizeInBits();
    if (N1C->getAPIntValue().uge(BitSize))
      return DAG.getUNDEF(VT);
    if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
      uint64_t ShiftAmt = N1C->getZExtValue();
      SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
                                       DAG.getConstant(ShiftAmt, DL0,
                                                       getShiftAmountTy(SmallVT)));
      AddToWorklist(SmallShift.getNode());
      // Mask off the bits that were shifted in from the undefined extension.
      APInt Mask = APInt::getLowBitsSet(OpSizeInBits, OpSizeInBits - ShiftAmt);
      return DAG.getNode(ISD::AND, DL, VT,
                         DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
                         DAG.getConstant(Mask, DL, VT));
  // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
  // bit, which is unmodified by sra.
  if (N1C && N1C->getAPIntValue() == (OpSizeInBits - 1)) {
    if (N0.getOpcode() == ISD::SRA)
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
  // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
  if (N1C && N0.getOpcode() == ISD::CTLZ &&
      N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
    KnownBits Known = DAG.computeKnownBits(N0.getOperand(0));
    // If any of the input bits are KnownOne, then the input couldn't be all
    // zeros, thus the result of the srl will always be zero.
    if (Known.One.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
    // If all of the bits input to the ctlz node are known to be zero, then
    // the result of the ctlz is "32" and the result of the shift is one.
    APInt UnknownBits = ~Known.Zero;
    if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
    // Otherwise, check to see if there is exactly one bit input to the ctlz.
    if (UnknownBits.isPowerOf2()) {
      // Okay, we know that only that the single bit specified by UnknownBits
      // could be set on input to the CTLZ node. If this bit is set, the SRL
      // will return 0, if it is clear, it returns 1. Change the CTLZ/SRL pair
      // to an SRL/XOR pair, which is likely to simplify more.
      unsigned ShAmt = UnknownBits.countTrailingZeros();
      SDValue Op = N0.getOperand(0);
        Op = DAG.getNode(ISD::SRL, DL, VT, Op,
                         DAG.getConstant(ShAmt, DL,
                                         getShiftAmountTy(Op.getValueType())));
        AddToWorklist(Op.getNode());
      return DAG.getNode(ISD::XOR, DL, VT,
                         Op, DAG.getConstant(1, DL, VT));
  // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
  if (N1.getOpcode() == ISD::TRUNCATE &&
      N1.getOperand(0).getOpcode() == ISD::AND) {
    if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
  // fold operands of srl based on knowledge that the low bits are not
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);
  if (N1C && !N1C->isOpaque())
    if (SDValue NewSRL = visitShiftByConstant(N))
  // Attempt to convert a srl of a load into a narrower zero-extending load.
  if (SDValue NarrowLoad = reduceLoadWidth(N))
  // Here is a common situation. We want to optimize:
  //   %b = and i32 %a, 2
  //   %c = srl i32 %b, 1
  //   brcond i32 %c ...
  //   %c = setcc eq %b, 0
  // However when after the source operand of SRL is optimized into AND, the SRL
  // itself may not be optimized further. Look for it and add the BRCOND into
  if (N->hasOneUse()) {
    SDNode *Use = *N->use_begin();
    if (Use->getOpcode() == ISD::BRCOND)
    else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
      // Also look past the truncate.
      Use = *Use->use_begin();
      if (Use->getOpcode() == ISD::BRCOND)
  // Try to transform this shift into a multiply-high if
  // it matches the appropriate pattern detected in combineShiftToMULH.
  if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
/// Combine step for ISD::FSHL / ISD::FSHR (funnel shifts).
/// fshl(A, B, C): concatenate A:B, shift left by C, take the high half.
/// fshr(A, B, C): concatenate A:B, shift right by C, take the low half.
SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
  EVT VT = N->getValueType(0);
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  bool IsFSHL = N->getOpcode() == ISD::FSHL;
  unsigned BitWidth = VT.getScalarSizeInBits();
  // fold (fshl N0, N1, 0) -> N0
  // fold (fshr N0, N1, 0) -> N1
  // Power-of-2 width lets us test "amount % BitWidth == 0" as a mask check.
  if (isPowerOf2_32(BitWidth))
    if (DAG.MaskedValueIsZero(
            N2, APInt(N2.getScalarValueSizeInBits(), BitWidth - 1)))
      return IsFSHL ? N0 : N1;
  auto IsUndefOrZero = [](SDValue V) {
    return V.isUndef() || isNullOrNullSplat(V, /*AllowUndefs*/ true);
  // TODO - support non-uniform vector shift amounts.
  if (ConstantSDNode *Cst = isConstOrConstSplat(N2)) {
    EVT ShAmtTy = N2.getValueType();
    // fold (fsh* N0, N1, c) -> (fsh* N0, N1, c % BitWidth)
    if (Cst->getAPIntValue().uge(BitWidth)) {
      uint64_t RotAmt = Cst->getAPIntValue().urem(BitWidth);
      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N0, N1,
                         DAG.getConstant(RotAmt, SDLoc(N), ShAmtTy));
    unsigned ShAmt = Cst->getZExtValue();
      return IsFSHL ? N0 : N1;
    // fold fshl(undef_or_zero, N1, C) -> lshr(N1, BW-C)
    // fold fshr(undef_or_zero, N1, C) -> lshr(N1, C)
    // fold fshl(N0, undef_or_zero, C) -> shl(N0, C)
    // fold fshr(N0, undef_or_zero, C) -> shl(N0, BW-C)
    if (IsUndefOrZero(N0))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1,
                         DAG.getConstant(IsFSHL ? BitWidth - ShAmt : ShAmt,
                                         SDLoc(N), ShAmtTy));
    if (IsUndefOrZero(N1))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
                         DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
                                         SDLoc(N), ShAmtTy));
    // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
    // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
    // TODO - bigendian support once we have test coverage.
    // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
    // TODO - permit LHS EXTLOAD if extensions are shifted out.
    if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
        !DAG.getDataLayout().isBigEndian()) {
      auto *LHS = dyn_cast<LoadSDNode>(N0);
      auto *RHS = dyn_cast<LoadSDNode>(N1);
      if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
          LHS->getAddressSpace() == RHS->getAddressSpace() &&
          (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
          ISD::isNON_EXTLoad(LHS)) {
        if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
              IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
          Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
          // Only form the new (possibly misaligned) load if the target says
          // the access is both allowed and fast enough.
          if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                                     RHS->getAddressSpace(), NewAlign,
                                     RHS->getMemOperand()->getFlags(), &Fast) &&
            SDValue NewPtr = DAG.getMemBasePlusOffset(
                RHS->getBasePtr(), TypeSize::Fixed(PtrOff), DL);
            AddToWorklist(NewPtr.getNode());
            SDValue Load = DAG.getLoad(
                VT, DL, RHS->getChain(), NewPtr,
                RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
                RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
            // Replace the old load's chain with the new load's chain.
            WorklistRemover DeadNodes(*this);
            DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
  // fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
  // fold fshl(N0, undef_or_zero, N2) -> shl(N0, N2)
  // iff we know the shift amount is in range.
  // TODO: when is it worth doing SUB(BW, N2) as well?
  if (isPowerOf2_32(BitWidth)) {
    APInt ModuloBits(N2.getScalarValueSizeInBits(), BitWidth - 1);
    if (IsUndefOrZero(N0) && !IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
      return DAG.getNode(ISD::SRL, SDLoc(N), VT, N1, N2);
    if (IsUndefOrZero(N1) && IsFSHL && DAG.MaskedValueIsZero(N2, ~ModuloBits))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N2);
  // fold (fshl N0, N0, N2) -> (rotl N0, N2)
  // fold (fshr N0, N0, N2) -> (rotr N0, N2)
  // TODO: Investigate flipping this rotate if only one is legal, if funnel shift
  // is legal as well we might be better off avoiding non-constant (BW - N2).
  unsigned RotOpc = IsFSHL ? ISD::ROTL : ISD::ROTR;
  if (N0 == N1 && hasOperation(RotOpc, VT))
    return DAG.getNode(RotOpc, SDLoc(N), VT, N0, N2);
  // Simplify, based on bits shifted out of N0/N1.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);
/// Combine step for ISD::SSHLSAT / ISD::USHLSAT (saturating shift left).
/// When the shift provably cannot saturate, it degrades to a plain SHL.
SDValue DAGCombiner::visitSHLSAT(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  if (SDValue V = DAG.simplifyShift(N0, N1))
  EVT VT = N0.getValueType();
  // fold (*shlsat c1, c2) -> c1<<c2
      DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, {N0, N1}))
  ConstantSDNode *N1C = isConstOrConstSplat(N1);
  if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) {
    // fold (sshlsat x, c) -> (shl x, c)
    // Safe when c is less than the number of sign bits: no sign bit is
    // shifted out, so no signed saturation can occur.
    if (N->getOpcode() == ISD::SSHLSAT && N1C &&
        N1C->getAPIntValue().ult(DAG.ComputeNumSignBits(N0)))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
    // fold (ushlsat x, c) -> (shl x, c)
    // Safe when c does not exceed the known leading zero count: no set bit
    // is shifted out, so no unsigned saturation can occur.
    if (N->getOpcode() == ISD::USHLSAT && N1C &&
        N1C->getAPIntValue().ule(
            DAG.computeKnownBits(N0).countMinLeadingZeros()))
      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1);
// Given an ABS node, detect the following pattern:
// (ABS (SUB (EXTEND a), (EXTEND b))).
// Generates an UABD/SABD (unsigned/signed absolute difference) node.
// Returns an empty SDValue when the pattern or target support is missing.
static SDValue combineABSToABD(SDNode *N, SelectionDAG &DAG,
                               const TargetLowering &TLI) {
  SDValue AbsOp1 = N->getOperand(0);
  if (AbsOp1.getOpcode() != ISD::SUB)
  Op0 = AbsOp1.getOperand(0);
  Op1 = AbsOp1.getOperand(1);
  unsigned Opc0 = Op0.getOpcode();
  // Check if the operands of the sub are (zero|sign)-extended.
  // Both operands must use the same extension kind; it selects ABDS vs ABDU.
  if (Opc0 != Op1.getOpcode() ||
      (Opc0 != ISD::ZERO_EXTEND && Opc0 != ISD::SIGN_EXTEND))
  EVT VT = N->getValueType(0);
  EVT VT1 = Op0.getOperand(0).getValueType();
  EVT VT2 = Op1.getOperand(0).getValueType();
  unsigned ABDOpcode = (Opc0 == ISD::SIGN_EXTEND) ? ISD::ABDS : ISD::ABDU;
  // fold abs(sext(x) - sext(y)) -> zext(abds(x, y))
  // fold abs(zext(x) - zext(y)) -> zext(abdu(x, y))
  // NOTE: Extensions must be equivalent.
  // Prefer the narrow-type ABD when both pre-extension types match and the
  // target supports it there; the result is zero-extended back to VT.
  if (VT1 == VT2 && TLI.isOperationLegalOrCustom(ABDOpcode, VT1)) {
    Op0 = Op0.getOperand(0);
    Op1 = Op1.getOperand(0);
    SDValue ABD = DAG.getNode(ABDOpcode, SDLoc(N), VT1, Op0, Op1);
    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, ABD);
  // fold abs(sext(x) - sext(y)) -> abds(sext(x), sext(y))
  // fold abs(zext(x) - zext(y)) -> abdu(zext(x), zext(y))
  if (TLI.isOperationLegalOrCustom(ABDOpcode, VT))
    return DAG.getNode(ABDOpcode, SDLoc(N), VT, Op0, Op1);
/// Combine step for ISD::ABS (integer absolute value).
SDValue DAGCombiner::visitABS(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (abs c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
  // fold (abs (abs x)) -> (abs x)
  if (N0.getOpcode() == ISD::ABS)
  // fold (abs x) -> x iff not-negative
  if (DAG.SignBitIsZero(N0))
  // Try to form an absolute-difference (ABD) node instead.
  if (SDValue ABD = combineABSToABD(N, DAG, TLI))
/// Combine step for ISD::BSWAP (byte-order reversal).
SDValue DAGCombiner::visitBSWAP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (bswap c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::BSWAP, DL, VT, N0);
  // fold (bswap (bswap x)) -> x
  if (N0.getOpcode() == ISD::BSWAP)
    return N0.getOperand(0);
  // Canonicalize bswap(bitreverse(x)) -> bitreverse(bswap(x)). If bitreverse
  // isn't supported, it will be expanded to bswap followed by a manual reversal
  // of bits in each byte. By placing bswaps before bitreverse, we can remove
  // the two bswaps if the bitreverse gets expanded.
  if (N0.getOpcode() == ISD::BITREVERSE && N0.hasOneUse()) {
    SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::BITREVERSE, DL, VT, BSwap);
  // fold (bswap shl(x,c)) -> (zext(bswap(trunc(shl(x,sub(c,bw/2))))))
  // iff x >= bw/2 (i.e. lower half is known zero)
  // The shift amount must also be a multiple of 16 so the half-width value
  // remains byte-aligned, and the half-width bswap must be cheap.
  unsigned BW = VT.getScalarSizeInBits();
  if (BW >= 32 && N0.getOpcode() == ISD::SHL && N0.hasOneUse()) {
    auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), BW / 2);
    if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
        ShAmt->getZExtValue() >= (BW / 2) &&
        (ShAmt->getZExtValue() % 16) == 0 && TLI.isTypeLegal(HalfVT) &&
        TLI.isTruncateFree(VT, HalfVT) &&
        (!LegalOperations || hasOperation(ISD::BSWAP, HalfVT))) {
      SDValue Res = N0.getOperand(0);
      if (uint64_t NewShAmt = (ShAmt->getZExtValue() - (BW / 2)))
        Res = DAG.getNode(ISD::SHL, DL, VT, Res,
                          DAG.getConstant(NewShAmt, DL, getShiftAmountTy(VT)));
      Res = DAG.getZExtOrTrunc(Res, DL, HalfVT);
      Res = DAG.getNode(ISD::BSWAP, DL, HalfVT, Res);
      return DAG.getZExtOrTrunc(Res, DL, VT);
  // Try to canonicalize bswap-of-logical-shift-by-8-bit-multiple as
  // inverse-shift-of-bswap:
  // bswap (X u<< C) --> (bswap X) u>> C
  // bswap (X u>> C) --> (bswap X) u<< C
  if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
    auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1));
    if (ShAmt && ShAmt->getAPIntValue().ult(BW) &&
        ShAmt->getZExtValue() % 8 == 0) {
      SDValue NewSwap = DAG.getNode(ISD::BSWAP, DL, VT, N0.getOperand(0));
      unsigned InverseShift = N0.getOpcode() == ISD::SHL ? ISD::SRL : ISD::SHL;
      return DAG.getNode(InverseShift, DL, VT, NewSwap, N0.getOperand(1));
/// Combine step for ISD::BITREVERSE (bit-order reversal).
SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (bitreverse c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::BITREVERSE, SDLoc(N), VT, N0);
  // fold (bitreverse (bitreverse x)) -> x
  if (N0.getOpcode() == ISD::BITREVERSE)
    return N0.getOperand(0);
/// Combine step for ISD::CTLZ (count leading zeros).
SDValue DAGCombiner::visitCTLZ(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (ctlz c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
  // If the value is known never to be zero, switch to the undef version.
  // CTLZ_ZERO_UNDEF has the same result for nonzero inputs but may lower to
  // cheaper code since it need not define a value for zero.
  if (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ_ZERO_UNDEF, VT)) {
    if (DAG.isKnownNeverZero(N0))
      return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
/// Combine step for ISD::CTLZ_ZERO_UNDEF (ctlz with undefined zero input).
SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (ctlz_zero_undef c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
/// Combine step for ISD::CTTZ (count trailing zeros).
SDValue DAGCombiner::visitCTTZ(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (cttz c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
  // If the value is known never to be zero, switch to the undef version.
  // CTTZ_ZERO_UNDEF has the same result for nonzero inputs but may lower to
  // cheaper code since it need not define a value for zero.
  if (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ_ZERO_UNDEF, VT)) {
    if (DAG.isKnownNeverZero(N0))
      return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
/// Combine step for ISD::CTTZ_ZERO_UNDEF (cttz with undefined zero input).
SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (cttz_zero_undef c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
/// Combine step for ISD::CTPOP (population count).
SDValue DAGCombiner::visitCTPOP(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  // fold (ctpop c1) -> c2
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
// Returns true when it is profitable and numerically safe to turn a
// select-of-compare into an FMINNUM/FMAXNUM style node: requires the
// no-signed-zeros FP option, a floating-point type the target wants
// combined, and both operands known never to be NaN.
// FIXME: This should be checking for no signed zeros on individual operands, as
// well as no nans.
static bool isLegalToCombineMinNumMaxNum(SelectionDAG &DAG, SDValue LHS,
                                         const TargetLowering &TLI) {
  const TargetOptions &Options = DAG.getTarget().Options;
  EVT VT = LHS.getValueType();
  return Options.NoSignedZerosFPMath && VT.isFloatingPoint() &&
         TLI.isProfitableToCombineMinNumMaxNum(VT) &&
         DAG.isKnownNeverNaN(LHS) && DAG.isKnownNeverNaN(RHS);
/// Generate Min/Max node from a select-of-compare whose compared values are
/// the same pair of values being selected (in either order). Dispatches on
/// the condition code to pick FMINNUM/FMAXNUM (or their IEEE variants).
static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
                                   SDValue RHS, SDValue True, SDValue False,
                                   ISD::CondCode CC, const TargetLowering &TLI,
                                   SelectionDAG &DAG) {
  // The select must pick between exactly the compared values.
  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
  EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
  case ISD::SETULE: {
    // Since it's known never nan to get here already, either fminnum or
    // fminnum_ieee are OK. Try the ieee version first, since it's fminnum is
    // expanded in terms of it.
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
    unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
  case ISD::SETUGE: {
    // Greater-or-equal style predicates: mirror image of the case above.
    unsigned IEEEOpcode = (LHS == True) ? ISD::FMAXNUM_IEEE : ISD::FMINNUM_IEEE;
    if (TLI.isOperationLegalOrCustom(IEEEOpcode, VT))
      return DAG.getNode(IEEEOpcode, DL, VT, LHS, RHS);
    unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
    if (TLI.isOperationLegalOrCustom(Opcode, TransformVT))
      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
/// If a (v)select has a condition value that is a sign-bit test, try to smear
/// the condition operand sign-bit across the value width and use it as a mask.
/// i.e. replace the select with an arithmetic-shift-right plus OR/AND.
static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
  SDValue Cond = N->getOperand(0);
  SDValue C1 = N->getOperand(1);
  SDValue C2 = N->getOperand(2);
  if (!isConstantOrConstantVector(C1) || !isConstantOrConstantVector(C2))
  EVT VT = N->getValueType(0);
  // The setcc operand type must match the select result type so the SRA of
  // the compared value can feed the OR/AND directly.
  if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
      VT != Cond.getOperand(0).getValueType())
  // The inverted-condition + commuted-select variants of these patterns are
  // canonicalized to these forms in IR.
  SDValue X = Cond.getOperand(0);
  SDValue CondC = Cond.getOperand(1);
  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
      isAllOnesOrAllOnesSplat(C2)) {
    // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
  if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
    // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
    SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
    SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
    return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
/// Fold (select Cond, C1, C2) where both arms are integer constants into
/// extension/arithmetic on the boolean condition (zext/sext/add/shl/xor).
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
  SDValue Cond = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue N2 = N->getOperand(2);
  EVT VT = N->getValueType(0);
  EVT CondVT = Cond.getValueType();
  if (!VT.isInteger())
  auto *C1 = dyn_cast<ConstantSDNode>(N1);
  auto *C2 = dyn_cast<ConstantSDNode>(N2);
  // Only do this before legalization to avoid conflicting with target-specific
  // transforms in the other direction (create a select from a zext/sext). There
  // is also a target-independent combine here in DAGCombiner in the other
  // direction for (select Cond, -1, 0) when the condition is not i1.
  if (CondVT == MVT::i1 && !LegalOperations) {
    if (C1->isZero() && C2->isOne()) {
      // select Cond, 0, 1 --> zext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
    if (C1->isZero() && C2->isAllOnes()) {
      // select Cond, 0, -1 --> sext (!Cond)
      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
    if (C1->isOne() && C2->isZero()) {
      // select Cond, 1, 0 --> zext (Cond)
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
    if (C1->isAllOnes() && C2->isZero()) {
      // select Cond, -1, 0 --> sext (Cond)
        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
    // Use a target hook because some targets may prefer to transform in the
    // other direction.
    if (TLI.convertSelectOfConstantsToMath(VT)) {
      // For any constants that differ by 1, we can transform the select into
      // an extend plus add.
      const APInt &C1Val = C1->getAPIntValue();
      const APInt &C2Val = C2->getAPIntValue();
      if (C1Val - 1 == C2Val) {
        // select Cond, C1, C1-1 --> add (zext Cond), C1-1
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      if (C1Val + 1 == C2Val) {
        // select Cond, C1, C1+1 --> add (sext Cond), C1+1
        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
      // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
      if (C1Val.isPowerOf2() && C2Val.isZero()) {
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
            DAG.getShiftAmountConstant(C1Val.exactLogBase2(), VT, DL);
        return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
    if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
  // fold (select Cond, 0, 1) -> (xor Cond, 1)
  // We can't do this reliably if integer based booleans have different contents
  // to floating point based booleans. This is because we can't tell whether we
  // have an integer-based boolean or a floating-point-based boolean unless we
  // can find the SETCC that produced it and inspect its operands. This is
  // fairly easy if C is the SETCC node, but it can potentially be
  // undiscoverable (or not reasonably discoverable). For example, it could be
  // in another basic block or it could require searching a complicated
  if (CondVT.isInteger() &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/true) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      TLI.getBooleanContents(/*isVec*/false, /*isFloat*/false) ==
          TargetLowering::ZeroOrOneBooleanContent &&
      C1->isZero() && C2->isOne()) {
      DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
    if (VT.bitsEq(CondVT))
    return DAG.getZExtOrTrunc(NotCond, DL, VT);
/// Fold a boolean (i1 / i1-vector) select into plain AND/OR logic when one
/// arm is the condition itself or an all-zeros/all-ones constant.
static SDValue foldBoolSelectToLogic(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::SELECT || N->getOpcode() == ISD::VSELECT) &&
         "Expected a (v)select");
  SDValue Cond = N->getOperand(0);
  SDValue T = N->getOperand(1), F = N->getOperand(2);
  EVT VT = N->getValueType(0);
  // Only applies when the selected values are the same 1-bit type as the
  // condition, so logic ops on the condition are equivalent to the select.
  if (VT != Cond.getValueType() || VT.getScalarSizeInBits() != 1)
  // select Cond, Cond, F --> or Cond, F
  // select Cond, 1, F    --> or Cond, F
  if (Cond == T || isOneOrOneSplat(T, /* AllowUndefs */ true))
    return DAG.getNode(ISD::OR, SDLoc(N), VT, Cond, F);
  // select Cond, T, Cond --> and Cond, T
  // select Cond, T, 0    --> and Cond, T
  if (Cond == F || isNullOrNullSplat(F, /* AllowUndefs */ true))
    return DAG.getNode(ISD::AND, SDLoc(N), VT, Cond, T);
  // select Cond, T, 1 --> or (not Cond), T
  if (isOneOrOneSplat(F, /* AllowUndefs */ true)) {
    SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
    return DAG.getNode(ISD::OR, SDLoc(N), VT, NotCond, T);
  // select Cond, 0, F --> and (not Cond), F
  if (isNullOrNullSplat(T, /* AllowUndefs */ true)) {
    SDValue NotCond = DAG.getNOT(SDLoc(N), Cond, VT);
    return DAG.getNode(ISD::AND, SDLoc(N), VT, NotCond, F);
// Fold a vselect whose condition is a sign-bit test of Cond0 (a SETCC of the
// form "Cond0 s< 0" / "Cond0 s> -1") into a sign-splat mask built with
// (sra Cond0, BW-1), then combined with the select arms via AND/OR.
// Requires the compare operand type to match the select's result type.
10278 static SDValue foldVSelectToSignBitSplatMask(SDNode *N, SelectionDAG &DAG) {
10279 SDValue N0 = N->getOperand(0);
10280 SDValue N1 = N->getOperand(1);
10281 SDValue N2 = N->getOperand(2);
10282 EVT VT = N->getValueType(0);
// Only a single-use SETCC condition is handled.
10283 if (N0.getOpcode() != ISD::SETCC || !N0.hasOneUse())
10286 SDValue Cond0 = N0.getOperand(0);
10287 SDValue Cond1 = N0.getOperand(1);
10288 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10289 if (VT != Cond0.getValueType())
10292 // Match a signbit check of Cond0 as "Cond0 s<0". Swap select operands if the
10293 // compare is inverted from that pattern ("Cond0 s> -1").
10294 if (CC == ISD::SETLT && isNullOrNullSplat(Cond1))
10295 ; // This is the pattern we are looking for.
10296 else if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(Cond1))
10301 // (Cond0 s< 0) ? N1 : 0 --> (Cond0 s>> BW-1) & N1
10302 if (isNullOrNullSplat(N2)) {
10304 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10305 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10306 return DAG.getNode(ISD::AND, DL, VT, Sra, N1);
10309 // (Cond0 s< 0) ? -1 : N2 --> (Cond0 s>> BW-1) | N2
10310 if (isAllOnesOrAllOnesSplat(N1)) {
10312 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10313 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10314 return DAG.getNode(ISD::OR, DL, VT, Sra, N2);
10317 // If we have to invert the sign bit mask, only do that transform if the
10318 // target has a bitwise 'and not' instruction (the invert is free).
10319 // (Cond0 s< -0) ? 0 : N2 --> ~(Cond0 s>> BW-1) & N2
10320 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10321 if (isNullOrNullSplat(N1) && TLI.hasAndNot(N1)) {
10323 SDValue ShiftAmt = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
10324 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Cond0, ShiftAmt);
10325 SDValue Not = DAG.getNOT(DL, Sra, VT);
10326 return DAG.getNode(ISD::AND, DL, VT, Not, N2);
10329 // TODO: There's another pattern in this family, but it may require
10330 // implementing hasOrNot() to check for profitability:
10331 // (Cond0 s> -1) ? -1 : N2 --> ~(Cond0 s>> BW-1) | N2
// Combine an ISD::SELECT node. Tries, in order: generic DAG select
// simplification, select-of-constants folds, boolean-select-to-logic folds,
// SimplifySelectOps, i1-condition normalization (and/or of conditions vs.
// nested selects), condition-flip canonicalization, setcc-based folds
// (min/max, unsigned-saturating add), SELECT_CC formation, and finally
// select-of-binops folding. Returns the replacement value or SDValue().
10336 SDValue DAGCombiner::visitSELECT(SDNode *N) {
10337 SDValue N0 = N->getOperand(0);
10338 SDValue N1 = N->getOperand(1);
10339 SDValue N2 = N->getOperand(2);
10340 EVT VT = N->getValueType(0);
10341 EVT VT0 = N0.getValueType();
10343 SDNodeFlags Flags = N->getFlags();
10345 if (SDValue V = DAG.simplifySelect(N0, N1, N2))
10348 if (SDValue V = foldSelectOfConstants(N))
10351 if (SDValue V = foldBoolSelectToLogic(N, DAG))
10354 // If we can fold this based on the true/false value, do so.
10355 if (SimplifySelectOps(N, N1, N2))
10356 return SDValue(N, 0); // Don't revisit N.
10358 if (VT0 == MVT::i1) {
10359 // The code in this block deals with the following 2 equivalences:
10360 // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
10361 // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
10362 // The target can specify its preferred form with the
10363 // shouldNormalizeToSelectSequence() callback. However we always transform
10364 // to the right anyway if we find the inner select exists in the DAG anyway
10365 // and we always transform to the left side if we know that we can further
10366 // optimize the combination of the conditions.
10367 bool normalizeToSequence =
10368 TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
10369 // select (and Cond0, Cond1), X, Y
10370 // -> select Cond0, (select Cond1, X, Y), Y
10371 if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
10372 SDValue Cond0 = N0->getOperand(0);
10373 SDValue Cond1 = N0->getOperand(1);
10374 SDValue InnerSelect =
10375 DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond1, N1, N2, Flags);
// Keep the expanded form if the target prefers it, or if the inner select
// already existed in the DAG (use_empty() is false for a pre-existing node).
10376 if (normalizeToSequence || !InnerSelect.use_empty())
10377 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0,
10378 InnerSelect, N2, Flags);
10379 // Cleanup on failure.
10380 if (InnerSelect.use_empty())
10381 recursivelyDeleteUnusedNodes(InnerSelect.getNode());
10383 // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
10384 if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
10385 SDValue Cond0 = N0->getOperand(0);
10386 SDValue Cond1 = N0->getOperand(1);
10387 SDValue InnerSelect = DAG.getNode(ISD::SELECT, DL, N1.getValueType(),
10388 Cond1, N1, N2, Flags);
10389 if (normalizeToSequence || !InnerSelect.use_empty())
10390 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Cond0, N1,
10391 InnerSelect, Flags);
10392 // Cleanup on failure.
10393 if (InnerSelect.use_empty())
10394 recursivelyDeleteUnusedNodes(InnerSelect.getNode());
10397 // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
10398 if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
10399 SDValue N1_0 = N1->getOperand(0);
10400 SDValue N1_1 = N1->getOperand(1);
10401 SDValue N1_2 = N1->getOperand(2);
10402 if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
10403 // Create the actual and node if we can generate good code for it.
10404 if (!normalizeToSequence) {
10405 SDValue And = DAG.getNode(ISD::AND, DL, N0.getValueType(), N0, N1_0);
10406 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), And, N1_1,
10409 // Otherwise see if we can optimize the "and" to a better pattern.
10410 if (SDValue Combined = visitANDLike(N0, N1_0, N)) {
10411 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1_1,
10416 // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
10417 if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
10418 SDValue N2_0 = N2->getOperand(0);
10419 SDValue N2_1 = N2->getOperand(1);
10420 SDValue N2_2 = N2->getOperand(2);
10421 if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
10422 // Create the actual or node if we can generate good code for it.
10423 if (!normalizeToSequence) {
10424 SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
10425 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
10428 // Otherwise see if we can optimize to a better pattern.
10429 if (SDValue Combined = visitORLike(N0, N2_0, N))
10430 return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Combined, N1,
10436 // select (not Cond), N1, N2 -> select Cond, N2, N1
10437 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false)) {
10438 SDValue SelectOp = DAG.getSelect(DL, VT, F, N2, N1);
10439 SelectOp->setFlags(Flags);
10443 // Fold selects based on a setcc into other things, such as min/max/abs.
10444 if (N0.getOpcode() == ISD::SETCC) {
10445 SDValue Cond0 = N0.getOperand(0), Cond1 = N0.getOperand(1);
10446 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10448 // select (fcmp lt x, y), x, y -> fminnum x, y
10449 // select (fcmp gt x, y), x, y -> fmaxnum x, y
10451 // This is OK if we don't care what happens if either operand is a NaN.
10452 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N1, N2, TLI))
10453 if (SDValue FMinMax = combineMinNumMaxNum(DL, VT, Cond0, Cond1, N1, N2,
10457 // Use 'unsigned add with overflow' to optimize an unsigned saturating add.
10458 // This is conservatively limited to pre-legal-operations to give targets
10459 // a chance to reverse the transform if they want to do that. Also, it is
10460 // unlikely that the pattern would be formed late, so it's probably not
10461 // worth going through the other checks.
10462 if (!LegalOperations && TLI.isOperationLegalOrCustom(ISD::UADDO, VT) &&
10463 CC == ISD::SETUGT && N0.hasOneUse() && isAllOnesConstant(N1) &&
10464 N2.getOpcode() == ISD::ADD && Cond0 == N2.getOperand(0)) {
10465 auto *C = dyn_cast<ConstantSDNode>(N2.getOperand(1));
10466 auto *NotC = dyn_cast<ConstantSDNode>(Cond1);
// The add constant and the compare constant must be bitwise complements.
10467 if (C && NotC && C->getAPIntValue() == ~NotC->getAPIntValue()) {
10468 // select (setcc Cond0, ~C, ugt), -1, (add Cond0, C) -->
10469 // uaddo Cond0, C; select uaddo.1, -1, uaddo.0
10471 // The IR equivalent of this transform would have this form:
10473 // %c = icmp ugt %x, ~C
10474 // %r = select %c, -1, %a
10476 // %u = call {iN,i1} llvm.uadd.with.overflow(%x, C)
10477 // %u0 = extractvalue %u, 0
10478 // %u1 = extractvalue %u, 1
10479 // %r = select %u1, -1, %u0
10480 SDVTList VTs = DAG.getVTList(VT, VT0);
10481 SDValue UAO = DAG.getNode(ISD::UADDO, DL, VTs, Cond0, N2.getOperand(1));
10482 return DAG.getSelect(DL, VT, UAO.getValue(1), N1, UAO.getValue(0));
// Prefer a single SELECT_CC node when the target supports it.
10486 if (TLI.isOperationLegal(ISD::SELECT_CC, VT) ||
10487 (!LegalOperations &&
10488 TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT))) {
10489 // Any flags available in a select/setcc fold will be on the setcc as they
10490 // migrated from fcmp
10491 Flags = N0->getFlags();
10492 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, VT, Cond0, Cond1, N1,
10493 N2, N0.getOperand(2));
10494 SelectNode->setFlags(Flags);
10498 if (SDValue NewSel = SimplifySelect(DL, N0, N1, N2))
10502 if (!VT.isVector())
10503 if (SDValue BinOp = foldSelectOfBinops(N))
10509 // This function assumes all the vselect's arguments are CONCAT_VECTOR
10510 // nodes and that the condition is a BV of ConstantSDNodes (or undefs).
// If the low half of the condition build_vector is one uniform value and the
// high half is another, the vselect can be rewritten as a CONCAT_VECTORS that
// picks each half directly from LHS or RHS.
10511 static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
10513 SDValue Cond = N->getOperand(0);
10514 SDValue LHS = N->getOperand(1);
10515 SDValue RHS = N->getOperand(2);
10516 EVT VT = N->getValueType(0);
10517 int NumElems = VT.getVectorNumElements();
10518 assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
10519 RHS.getOpcode() == ISD::CONCAT_VECTORS &&
10520 Cond.getOpcode() == ISD::BUILD_VECTOR);
10522 // CONCAT_VECTOR can take an arbitrary number of arguments. We only care about
10523 // binary ones here.
10524 if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
10527 // We're sure we have an even number of elements due to the
10528 // concat_vectors we have as arguments to vselect.
10529 // Skip BV elements until we find one that's not an UNDEF
10530 // After we find an UNDEF element, keep looping until we get to half the
10531 // length of the BV and see if all the non-undef nodes are the same.
10532 ConstantSDNode *BottomHalf = nullptr;
10533 for (int i = 0; i < NumElems / 2; ++i) {
10534 if (Cond->getOperand(i)->isUndef())
10537 if (BottomHalf == nullptr)
10538 BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
// A second distinct non-undef constant in this half defeats the fold.
10539 else if (Cond->getOperand(i).getNode() != BottomHalf)
10543 // Do the same for the second half of the BuildVector
10544 ConstantSDNode *TopHalf = nullptr;
10545 for (int i = NumElems / 2; i < NumElems; ++i) {
10546 if (Cond->getOperand(i)->isUndef())
10549 if (TopHalf == nullptr)
10550 TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
10551 else if (Cond->getOperand(i).getNode() != TopHalf)
10555 assert(TopHalf && BottomHalf &&
10556 "One half of the selector was all UNDEFs and the other was all the "
10557 "same value. This should have been addressed before this function.");
// Zero selects the false (RHS) half; non-zero selects the true (LHS) half.
10558 return DAG.getNode(
10559 ISD::CONCAT_VECTORS, DL, VT,
10560 BottomHalf->isZero() ? RHS->getOperand(0) : LHS->getOperand(0),
10561 TopHalf->isZero() ? RHS->getOperand(1) : LHS->getOperand(1));
// For a gather/scatter with a null base pointer and an ADD index, try to peel
// a splatted LHS of the add out of the vector index and use its scalar value
// as the uniform base pointer instead. Updates BasePtr/Index in place and
// returns true on success.
// NOTE(review): IndexIsScaled is accepted but not used in the visible code —
// presumably reserved for scaled-index handling; confirm against callers.
10564 bool refineUniformBase(SDValue &BasePtr, SDValue &Index, bool IndexIsScaled,
10565 SelectionDAG &DAG) {
10566 if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD)
10569 // Only perform the transformation when existing operands can be reused.
10573 // For now we check only the LHS of the add.
10574 SDValue LHS = Index.getOperand(0);
10575 SDValue SplatVal = DAG.getSplatValue(LHS);
// The splatted scalar must already have pointer-compatible type to serve
// as the new base.
10576 if (!SplatVal || SplatVal.getValueType() != BasePtr.getValueType())
10579 BasePtr = SplatVal;
10580 Index = Index.getOperand(1);
10584 // Fold sext/zext of index into index type.
// Looks through a ZERO_EXTEND (always safe) or a SIGN_EXTEND (only when the
// current index type is signed) around the gather/scatter index, asking the
// target via shouldRemoveExtendFromGSIndex() whether the narrower index is
// acceptable. Updates Index/IndexType in place.
10585 bool refineIndexType(SDValue &Index, ISD::MemIndexType &IndexType, EVT DataVT,
10586 SelectionDAG &DAG) {
10587 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
10589 // It's always safe to look through zero extends.
10590 if (Index.getOpcode() == ISD::ZERO_EXTEND) {
10591 SDValue Op = Index.getOperand(0);
10592 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
// Dropping a zext means the remaining index bits are unsigned.
10593 IndexType = ISD::UNSIGNED_SCALED;
// Even if the extend is kept, a zero-extended index is unsigned, so a
// signed index type can be relaxed to unsigned.
10597 if (ISD::isIndexTypeSigned(IndexType)) {
10598 IndexType = ISD::UNSIGNED_SCALED;
10603 // It's only safe to look through sign extends when Index is signed.
10604 if (Index.getOpcode() == ISD::SIGN_EXTEND &&
10605 ISD::isIndexTypeSigned(IndexType)) {
10606 SDValue Op = Index.getOperand(0);
10607 if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType(), DataVT)) {
// Combine a masked-scatter node: drop scatters with an all-zero mask, and
// rebuild the node when the base pointer or index can be refined
// (refineUniformBase / refineIndexType).
10616 SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
10617 MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
10618 SDValue Mask = MSC->getMask();
10619 SDValue Chain = MSC->getChain();
10620 SDValue Index = MSC->getIndex();
10621 SDValue Scale = MSC->getScale();
10622 SDValue StoreVal = MSC->getValue();
10623 SDValue BasePtr = MSC->getBasePtr();
10624 ISD::MemIndexType IndexType = MSC->getIndexType();
10627 // Zap scatters with a zero mask.
10628 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
// Rebuild with a uniform scalar base peeled out of the index.
10631 if (refineUniformBase(BasePtr, Index, MSC->isIndexScaled(), DAG)) {
10632 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
10633 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
10634 DL, Ops, MSC->getMemOperand(), IndexType,
10635 MSC->isTruncatingStore());
// Rebuild with a narrower / re-typed index.
10638 if (refineIndexType(Index, IndexType, StoreVal.getValueType(), DAG)) {
10639 SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
10640 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(),
10641 DL, Ops, MSC->getMemOperand(), IndexType,
10642 MSC->isTruncatingStore());
// Combine a masked-store node: drop stores with an all-zero mask, convert an
// all-ones-mask store to a regular store, try indexed-store formation, shrink
// the stored value of a truncating store via SimplifyDemandedBits, and fold a
// TRUNCATE feeding the store into a masked truncating store.
10648 SDValue DAGCombiner::visitMSTORE(SDNode *N) {
10649 MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
10650 SDValue Mask = MST->getMask();
10651 SDValue Chain = MST->getChain();
10652 SDValue Value = MST->getValue();
10653 SDValue Ptr = MST->getBasePtr();
10656 // Zap masked stores with a zero mask.
10657 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10660 // If this is a masked load with an all ones mask, we can use a unmasked load.
10661 // FIXME: Can we do this for indexed, compressing, or truncating stores?
10662 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MST->isUnindexed() &&
10663 !MST->isCompressingStore() && !MST->isTruncatingStore())
10664 return DAG.getStore(MST->getChain(), SDLoc(N), MST->getValue(),
10665 MST->getBasePtr(), MST->getPointerInfo(),
10666 MST->getOriginalAlign(), MachineMemOperand::MOStore,
10669 // Try transforming N to an indexed store.
10670 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
10671 return SDValue(N, 0);
// For truncating stores of non-opaque integer values, only the low memory-VT
// bits of Value are actually stored, so the rest can be simplified away.
10673 if (MST->isTruncatingStore() && MST->isUnindexed() &&
10674 Value.getValueType().isInteger() &&
10675 (!isa<ConstantSDNode>(Value) ||
10676 !cast<ConstantSDNode>(Value)->isOpaque())) {
10677 APInt TruncDemandedBits =
10678 APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
10679 MST->getMemoryVT().getScalarSizeInBits());
10681 // See if we can simplify the operation with
10682 // SimplifyDemandedBits, which only works if the value has a single use.
10683 if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
10684 // Re-visit the store if anything changed and the store hasn't been merged
10685 // with another node (N is deleted) SimplifyDemandedBits will add Value's
10686 // node back to the worklist if necessary, but we also need to re-visit
10687 // the Store node itself.
10688 if (N->getOpcode() != ISD::DELETED_NODE)
10690 return SDValue(N, 0);
10694 // If this is a TRUNC followed by a masked store, fold this into a masked
10695 // truncating store. We can do this even if this is already a masked
10697 if ((Value.getOpcode() == ISD::TRUNCATE) && Value->hasOneUse() &&
10698 MST->isUnindexed() &&
10699 TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
10700 MST->getMemoryVT(), LegalOperations)) {
// The mask must match the wider (pre-truncate) value type.
10701 auto Mask = TLI.promoteTargetBoolean(DAG, MST->getMask(),
10702 Value.getOperand(0).getValueType());
10703 return DAG.getMaskedStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
10704 MST->getOffset(), Mask, MST->getMemoryVT(),
10705 MST->getMemOperand(), MST->getAddressingMode(),
10706 /*IsTruncating=*/true);
// Combine a masked-gather node: an all-zero mask yields the pass-through
// value, and the node is rebuilt when the base pointer or index can be
// refined (refineUniformBase / refineIndexType).
10712 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
10713 MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
10714 SDValue Mask = MGT->getMask();
10715 SDValue Chain = MGT->getChain();
10716 SDValue Index = MGT->getIndex();
10717 SDValue Scale = MGT->getScale();
10718 SDValue PassThru = MGT->getPassThru();
10719 SDValue BasePtr = MGT->getBasePtr();
10720 ISD::MemIndexType IndexType = MGT->getIndexType();
10723 // Zap gathers with a zero mask.
10724 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10725 return CombineTo(N, PassThru, MGT->getChain());
// Rebuild with a uniform scalar base peeled out of the index.
10727 if (refineUniformBase(BasePtr, Index, MGT->isIndexScaled(), DAG)) {
10728 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
10729 return DAG.getMaskedGather(
10730 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
10731 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
// Rebuild with a narrower / re-typed index.
10734 if (refineIndexType(Index, IndexType, N->getValueType(0), DAG)) {
10735 SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
10736 return DAG.getMaskedGather(
10737 DAG.getVTList(N->getValueType(0), MVT::Other), MGT->getMemoryVT(), DL,
10738 Ops, MGT->getMemOperand(), IndexType, MGT->getExtensionType());
// Combine a masked-load node: an all-zero mask yields the pass-through value,
// an all-ones mask (unindexed, non-expanding, non-extending) becomes a plain
// load, and indexed-load formation is attempted.
10744 SDValue DAGCombiner::visitMLOAD(SDNode *N) {
10745 MaskedLoadSDNode *MLD = cast<MaskedLoadSDNode>(N);
10746 SDValue Mask = MLD->getMask();
10749 // Zap masked loads with a zero mask.
10750 if (ISD::isConstantSplatVectorAllZeros(Mask.getNode()))
10751 return CombineTo(N, MLD->getPassThru(), MLD->getChain());
10753 // If this is a masked load with an all ones mask, we can use a unmasked load.
10754 // FIXME: Can we do this for indexed, expanding, or extending loads?
10755 if (ISD::isConstantSplatVectorAllOnes(Mask.getNode()) && MLD->isUnindexed() &&
10756 !MLD->isExpandingLoad() && MLD->getExtensionType() == ISD::NON_EXTLOAD) {
10757 SDValue NewLd = DAG.getLoad(
10758 N->getValueType(0), SDLoc(N), MLD->getChain(), MLD->getBasePtr(),
10759 MLD->getPointerInfo(), MLD->getOriginalAlign(),
10760 MachineMemOperand::MOLoad, MLD->getAAInfo(), MLD->getRanges());
// Replace both the value result and the chain result of the masked load.
10761 return CombineTo(N, NewLd, NewLd.getValue(1));
10764 // Try transforming N to an indexed load.
10765 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
10766 return SDValue(N, 0);
10771 /// A vector select of 2 constant vectors can be simplified to math/logic to
10772 /// avoid a variable select instruction and possibly avoid constant loads.
/// Handled patterns (target must opt in via convertSelectOfConstantsToMath):
/// - arms differ element-wise by exactly +1 / -1: select becomes
///   add (zext/sext Cond), C.
/// - true arm is a power-of-2 splat and false arm is zero: select becomes a
///   shift of the zero-extended condition.
/// - otherwise defer to foldSelectOfConstantsUsingSra.
10773 SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
10774 SDValue Cond = N->getOperand(0);
10775 SDValue N1 = N->getOperand(1);
10776 SDValue N2 = N->getOperand(2);
10777 EVT VT = N->getValueType(0);
// Require a single-use i1-per-element condition and two all-constant
// build_vector arms.
10778 if (!Cond.hasOneUse() || Cond.getScalarValueSizeInBits() != 1 ||
10779 !TLI.convertSelectOfConstantsToMath(VT) ||
10780 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()) ||
10781 !ISD::isBuildVectorOfConstantSDNodes(N2.getNode()))
10784 // Check if we can use the condition value to increment/decrement a single
10785 // constant value. This simplifies a select to an add and removes a constant
10786 // load/materialization from the general case.
10787 bool AllAddOne = true;
10788 bool AllSubOne = true;
10789 unsigned Elts = VT.getVectorNumElements();
10790 for (unsigned i = 0; i != Elts; ++i) {
10791 SDValue N1Elt = N1.getOperand(i);
10792 SDValue N2Elt = N2.getOperand(i);
10793 if (N1Elt.isUndef() || N2Elt.isUndef())
10795 if (N1Elt.getValueType() != N2Elt.getValueType())
10798 const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
10799 const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
10806 // Further simplifications for the extra-special cases where the constants are
10807 // all 0 or all -1 should be implemented as folds of these patterns.
10809 if (AllAddOne || AllSubOne) {
10810 // vselect <N x i1> Cond, C+1, C --> add (zext Cond), C
10811 // vselect <N x i1> Cond, C-1, C --> add (sext Cond), C
10812 auto ExtendOpcode = AllAddOne ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
10813 SDValue ExtendedCond = DAG.getNode(ExtendOpcode, DL, VT, Cond);
10814 return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
10817 // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
10819 if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
10820 isNullOrNullSplat(N2)) {
10821 SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
10822 SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
10823 return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
10826 if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
10829 // The general case for select-of-constants:
10830 // vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
10831 // ...but that only makes sense if a vselect is slower than 2 logic ops, so
10832 // leave that to a machine-specific pass.
// Combine an ISD::VSELECT node. Tries: generic simplification, boolean
// select-to-logic, condition-flip canonicalization, integer abs recognition,
// setcc-based folds (fmin/fmax, FP-to-sat, compare widening, UADDSAT/USUBSAT
// matching), SimplifySelectOps, constant-mask folds, select-to-concat, and
// demanded-elements simplification. Returns the replacement or SDValue().
10836 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
10837 SDValue N0 = N->getOperand(0);
10838 SDValue N1 = N->getOperand(1);
10839 SDValue N2 = N->getOperand(2);
10840 EVT VT = N->getValueType(0);
10843 if (SDValue V = DAG.simplifySelect(N0, N1, N2))
10846 if (SDValue V = foldBoolSelectToLogic(N, DAG))
10849 // vselect (not Cond), N1, N2 -> vselect Cond, N2, N1
10850 if (SDValue F = extractBooleanFlip(N0, DAG, TLI, false))
10851 return DAG.getSelect(DL, VT, F, N2, N1);
10853 // Canonicalize integer abs.
10854 // vselect (setg[te] X, 0), X, -X ->
10855 // vselect (setgt X, -1), X, -X ->
10856 // vselect (setl[te] X, 0), -X, X ->
10857 // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
10858 if (N0.getOpcode() == ISD::SETCC) {
10859 SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
10860 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
10861 bool isAbs = false;
10862 bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// abs pattern: compare selects between X and (sub 0, X).
10864 if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
10865 (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
10866 N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
10867 isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
10868 else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
10869 N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
10870 isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
// Prefer a native ABS node; otherwise expand as sra/add/xor.
10873 if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
10874 return DAG.getNode(ISD::ABS, DL, VT, LHS);
10876 SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
10877 DAG.getConstant(VT.getScalarSizeInBits() - 1,
10878 DL, getShiftAmountTy(VT)));
10879 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
10880 AddToWorklist(Shift.getNode());
10881 AddToWorklist(Add.getNode());
10882 return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
10885 // vselect x, y (fcmp lt x, y) -> fminnum x, y
10886 // vselect x, y (fcmp gt x, y) -> fmaxnum x, y
10888 // This is OK if we don't care about what happens if either operand is a
10891 if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
10892 if (SDValue FMinMax =
10893 combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
10897 if (SDValue S = PerformMinMaxFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
10899 if (SDValue S = PerformUMinFpToSatCombine(LHS, RHS, N1, N2, CC, DAG))
10902 // If this select has a condition (setcc) with narrower operands than the
10903 // select, try to widen the compare to match the select width.
10904 // TODO: This should be extended to handle any constant.
10905 // TODO: This could be extended to handle non-loading patterns, but that
10906 // requires thorough testing to avoid regressions.
10907 if (isNullOrNullSplat(RHS)) {
10908 EVT NarrowVT = LHS.getValueType();
10909 EVT WideVT = N1.getValueType().changeVectorElementTypeToInteger();
10910 EVT SetCCVT = getSetCCResultType(LHS.getValueType());
10911 unsigned SetCCWidth = SetCCVT.getScalarSizeInBits();
10912 unsigned WideWidth = WideVT.getScalarSizeInBits();
10913 bool IsSigned = isSignedIntSetCC(CC);
10914 auto LoadExtOpcode = IsSigned ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
10915 if (LHS.getOpcode() == ISD::LOAD && LHS.hasOneUse() &&
10916 SetCCWidth != 1 && SetCCWidth < WideWidth &&
10917 TLI.isLoadExtLegalOrCustom(LoadExtOpcode, WideVT, NarrowVT) &&
10918 TLI.isOperationLegalOrCustom(ISD::SETCC, WideVT)) {
10919 // Both compare operands can be widened for free. The LHS can use an
10920 // extended load, and the RHS is a constant:
10921 // vselect (ext (setcc load(X), C)), N1, N2 -->
10922 // vselect (setcc extload(X), C'), N1, N2
10923 auto ExtOpcode = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
10924 SDValue WideLHS = DAG.getNode(ExtOpcode, DL, WideVT, LHS);
10925 SDValue WideRHS = DAG.getNode(ExtOpcode, DL, WideVT, RHS);
10926 EVT WideSetCCVT = getSetCCResultType(WideVT);
10927 SDValue WideSetCC = DAG.getSetCC(DL, WideSetCCVT, WideLHS, WideRHS, CC);
10928 return DAG.getSelect(DL, N1.getValueType(), WideSetCC, N1, N2);
10932 // Match VSELECTs into add with unsigned saturation.
10933 if (hasOperation(ISD::UADDSAT, VT)) {
10934 // Check if one of the arms of the VSELECT is vector with all bits set.
10935 // If it's on the left side invert the predicate to simplify logic below.
10937 ISD::CondCode SatCC = CC;
10938 if (ISD::isConstantSplatVectorAllOnes(N1.getNode())) {
10940 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
10941 } else if (ISD::isConstantSplatVectorAllOnes(N2.getNode())) {
10945 if (Other && Other.getOpcode() == ISD::ADD) {
10946 SDValue CondLHS = LHS, CondRHS = RHS;
10947 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
10949 // Canonicalize condition operands.
10950 if (SatCC == ISD::SETUGE) {
10951 std::swap(CondLHS, CondRHS);
10952 SatCC = ISD::SETULE;
10955 // We can test against either of the addition operands.
10956 // x <= x+y ? x+y : ~0 --> uaddsat x, y
10957 // x+y >= x ? x+y : ~0 --> uaddsat x, y
10958 if (SatCC == ISD::SETULE && Other == CondRHS &&
10959 (OpLHS == CondLHS || OpRHS == CondLHS))
10960 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
10962 if (OpRHS.getOpcode() == CondRHS.getOpcode() &&
10963 (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
10964 OpRHS.getOpcode() == ISD::SPLAT_VECTOR) &&
10965 CondLHS == OpLHS) {
10966 // If the RHS is a constant we have to reverse the const
10967 // canonicalization.
10968 // x >= ~C ? x+C : ~0 --> uaddsat x, C
10969 auto MatchUADDSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
10970 return Cond->getAPIntValue() == ~Op->getAPIntValue();
10972 if (SatCC == ISD::SETULE &&
10973 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUADDSAT))
10974 return DAG.getNode(ISD::UADDSAT, DL, VT, OpLHS, OpRHS);
10979 // Match VSELECTs into sub with unsigned saturation.
10980 if (hasOperation(ISD::USUBSAT, VT)) {
10981 // Check if one of the arms of the VSELECT is a zero vector. If it's on
10982 // the left side invert the predicate to simplify logic below.
10984 ISD::CondCode SatCC = CC;
10985 if (ISD::isConstantSplatVectorAllZeros(N1.getNode())) {
10987 SatCC = ISD::getSetCCInverse(SatCC, VT.getScalarType());
10988 } else if (ISD::isConstantSplatVectorAllZeros(N2.getNode())) {
10992 // zext(x) >= y ? trunc(zext(x) - y) : 0
10993 // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
10994 // zext(x) > y ? trunc(zext(x) - y) : 0
10995 // --> usubsat(trunc(zext(x)),trunc(umin(y,SatLimit)))
10996 if (Other && Other.getOpcode() == ISD::TRUNCATE &&
10997 Other.getOperand(0).getOpcode() == ISD::SUB &&
10998 (SatCC == ISD::SETUGE || SatCC == ISD::SETUGT)) {
10999 SDValue OpLHS = Other.getOperand(0).getOperand(0);
11000 SDValue OpRHS = Other.getOperand(0).getOperand(1);
11001 if (LHS == OpLHS && RHS == OpRHS && LHS.getOpcode() == ISD::ZERO_EXTEND)
11002 if (SDValue R = getTruncatedUSUBSAT(VT, LHS.getValueType(), LHS, RHS,
11007 if (Other && Other.getNumOperands() == 2) {
11008 SDValue CondRHS = RHS;
11009 SDValue OpLHS = Other.getOperand(0), OpRHS = Other.getOperand(1);
11011 if (OpLHS == LHS) {
11012 // Look for a general sub with unsigned saturation first.
11013 // x >= y ? x-y : 0 --> usubsat x, y
11014 // x > y ? x-y : 0 --> usubsat x, y
11015 if ((SatCC == ISD::SETUGE || SatCC == ISD::SETUGT) &&
11016 Other.getOpcode() == ISD::SUB && OpRHS == CondRHS)
11017 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11019 if (OpRHS.getOpcode() == ISD::BUILD_VECTOR ||
11020 OpRHS.getOpcode() == ISD::SPLAT_VECTOR) {
11021 if (CondRHS.getOpcode() == ISD::BUILD_VECTOR ||
11022 CondRHS.getOpcode() == ISD::SPLAT_VECTOR) {
11023 // If the RHS is a constant we have to reverse the const
11024 // canonicalization.
11025 // x > C-1 ? x+-C : 0 --> usubsat x, C
11026 auto MatchUSUBSAT = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
11027 return (!Op && !Cond) ||
11029 Cond->getAPIntValue() == (-Op->getAPIntValue() - 1));
11031 if (SatCC == ISD::SETUGT && Other.getOpcode() == ISD::ADD &&
11032 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchUSUBSAT,
11033 /*AllowUndefs*/ true)) {
// Negate the add constant to recover the subtracted constant C.
11034 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
11035 DAG.getConstant(0, DL, VT), OpRHS);
11036 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11039 // Another special case: If C was a sign bit, the sub has been
11040 // canonicalized into a xor.
11041 // FIXME: Would it be better to use computeKnownBits to
11042 // determine whether it's safe to decanonicalize the xor?
11043 // x s< 0 ? x^C : 0 --> usubsat x, C
11045 if (SatCC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
11046 ISD::isConstantSplatVector(OpRHS.getNode(), SplatValue) &&
11047 ISD::isConstantSplatVectorAllZeros(CondRHS.getNode()) &&
11048 SplatValue.isSignMask()) {
11049 // Note that we have to rebuild the RHS constant here to
11050 // ensure we don't rely on particular values of undef lanes.
11051 OpRHS = DAG.getConstant(SplatValue, DL, VT);
11052 return DAG.getNode(ISD::USUBSAT, DL, VT, OpLHS, OpRHS);
11061 if (SimplifySelectOps(N, N1, N2))
11062 return SDValue(N, 0); // Don't revisit N.
11064 // Fold (vselect all_ones, N1, N2) -> N1
11065 if (ISD::isConstantSplatVectorAllOnes(N0.getNode()))
11067 // Fold (vselect all_zeros, N1, N2) -> N2
11068 if (ISD::isConstantSplatVectorAllZeros(N0.getNode()))
11071 // The ConvertSelectToConcatVector function is assuming both the above
11072 // checks for (vselect (build_vector all{ones,zeros) ...) have been made
11074 if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
11075 N2.getOpcode() == ISD::CONCAT_VECTORS &&
11076 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
11077 if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
11081 if (SDValue V = foldVSelectOfConstants(N))
// Sign-bit-splat folds need a legal/custom SRA on this type.
11084 if (hasOperation(ISD::SRA, VT))
11085 if (SDValue V = foldVSelectToSignBitSplatMask(N, DAG))
11088 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
11089 return SDValue(N, 0);
// Combine an ISD::SELECT_CC node: fold identical arms, constant-fold the
// embedded comparison via SimplifySetCC (picking an arm when the condition is
// known, propagating undef, or rebuilding a simpler SELECT_CC), simplify the
// true/false operands, and finally defer to SimplifySelectCC.
11094 SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
11095 SDValue N0 = N->getOperand(0);
11096 SDValue N1 = N->getOperand(1);
11097 SDValue N2 = N->getOperand(2);
11098 SDValue N3 = N->getOperand(3);
11099 SDValue N4 = N->getOperand(4);
11100 ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
11102 // fold select_cc lhs, rhs, x, x, cc -> x
11106 // Determine if the condition we're dealing with is constant
11107 if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
11108 CC, SDLoc(N), false)) {
11109 AddToWorklist(SCC.getNode());
11111 // cond always true -> true val
11112 // cond always false -> false val
11113 if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode()))
11114 return SCCC->isZero() ? N3 : N2;
11116 // When the condition is UNDEF, just return the first operand. This is
11117 // coherent the DAG creation, no setcc node is created in this case
11118 if (SCC->isUndef())
11121 // Fold to a simpler select_cc
11122 if (SCC.getOpcode() == ISD::SETCC) {
11123 SDValue SelectOp = DAG.getNode(
11124 ISD::SELECT_CC, SDLoc(N), N2.getValueType(), SCC.getOperand(0),
11125 SCC.getOperand(1), N2, N3, SCC.getOperand(2));
// Preserve the fast-math / poison-generating flags of the simplified setcc.
11126 SelectOp->setFlags(SCC->getFlags());
11131 // If we can fold this based on the true/false value, do so.
11132 if (SimplifySelectOps(N, N2, N3))
11133 return SDValue(N, 0); // Don't revisit N.
11135 // fold select_cc into other things, such as min/max/abs
11136 return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
// Combine a SETCC node. Includes a freeze-hoisting transform (push FREEZE
// below the compare so BRCOND(FREEZE(X)) can later become BRCOND(X)) and a
// call to SimplifySetCC; if the result is no longer a setcc but one is
// preferred (brcond user), attempt to rebuild one.
11139 SDValue DAGCombiner::visitSETCC(SDNode *N) {
11140 // setcc is very commonly used as an argument to brcond. This pattern
11141 // also lends itself to numerous combines and, as a result, it is desired
11142 // we keep the argument to a brcond as a setcc as much as possible.
11144 N->hasOneUse() && N->use_begin()->getOpcode() == ISD::BRCOND;
11146 ISD::CondCode Cond = cast<CondCodeSDNode>(N->getOperand(2))->get();
11147 EVT VT = N->getValueType(0);
11149 // SETCC(FREEZE(X), CONST, Cond)
11151 // FREEZE(SETCC(X, CONST, Cond))
11152 // This is correct if FREEZE(X) has one use and SETCC(FREEZE(X), CONST, Cond)
11153 // isn't equivalent to true or false.
11154 // For example, SETCC(FREEZE(X), -128, SETULT) cannot be folded to
11155 // FREEZE(SETCC(X, -128, SETULT)) because X can be poison.
11157 // This transformation is beneficial because visitBRCOND can fold
11158 // BRCOND(FREEZE(X)) to BRCOND(X).
11160 // Conservatively optimize integer comparisons only.
11162 // Do this only when SETCC is going to be used by BRCOND.
11164 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
11165 ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
11166 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
11167 bool Updated = false;
11169 // Is 'X Cond C' always true or false?
11170 auto IsAlwaysTrueOrFalse = [](ISD::CondCode Cond, ConstantSDNode *C) {
// Degenerate comparisons against the extreme value for the condition code:
// e.g. unsigned < 0 is always false; unsigned >= 0 is always true.
11171 bool False = (Cond == ISD::SETULT && C->isZero()) ||
11172 (Cond == ISD::SETLT && C->isMinSignedValue()) ||
11173 (Cond == ISD::SETUGT && C->isAllOnes()) ||
11174 (Cond == ISD::SETGT && C->isMaxSignedValue());
11175 bool True = (Cond == ISD::SETULE && C->isAllOnes()) ||
11176 (Cond == ISD::SETLE && C->isMaxSignedValue()) ||
11177 (Cond == ISD::SETUGE && C->isZero()) ||
11178 (Cond == ISD::SETGE && C->isMinSignedValue());
11179 return True || False;
// Look through FREEZE on either operand when the compare cannot be constant
// folded to true/false (which would be unsound for a poison input).
11182 if (N0->getOpcode() == ISD::FREEZE && N0.hasOneUse() && N1C) {
11183 if (!IsAlwaysTrueOrFalse(Cond, N1C)) {
11184 N0 = N0->getOperand(0);
11188 if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse() && N0C) {
// Swap the condition so the constant-vs-extreme check applies to N0.
11189 if (!IsAlwaysTrueOrFalse(ISD::getSetCCSwappedOperands(Cond),
11191 N1 = N1->getOperand(0);
11197 return DAG.getFreeze(DAG.getSetCC(SDLoc(N), VT, N0, N1, Cond));
11200 SDValue Combined = SimplifySetCC(VT, N->getOperand(0), N->getOperand(1), Cond,
11201 SDLoc(N), !PreferSetCC);
11206 // If we prefer to have a setcc, and we don't, we'll try our best to
11207 // recreate one using rebuildSetCC.
11208 if (PreferSetCC && Combined.getOpcode() != ISD::SETCC) {
11209 SDValue NewSetCC = rebuildSetCC(Combined);
11211 // We don't have anything interesting to combine to.
11212 if (NewSetCC.getNode() == N)
// Combine a SETCCCARRY node (compare with carry-in).
// When the carry-in is provably zero, the carry contributes nothing and the
// node degenerates to an ordinary SETCC on the same operands.
11222 SDValue DAGCombiner::visitSETCCCARRY(SDNode *N) {
11223 SDValue LHS = N->getOperand(0);
11224 SDValue RHS = N->getOperand(1);
11225 SDValue Carry = N->getOperand(2);
11226 SDValue Cond = N->getOperand(3);
11228 // If Carry is false, fold to a regular SETCC.
11229 if (isNullConstant(Carry))
11230 return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
11235 /// Check if N satisfies:
11236 ///   N is used once.
11238 ///   The load is compatible with ExtOpcode. It means
11239 ///   If load has explicit zero/sign extension, ExtOpcode must have the same
11241 ///   Otherwise returns true.
11242 static bool isCompatibleLoad(SDValue N, unsigned ExtOpcode) {
11243 if (!N.hasOneUse())
11246 if (!isa<LoadSDNode>(N))
11249 LoadSDNode *Load = cast<LoadSDNode>(N);
11250 ISD::LoadExtType LoadExt = Load->getExtensionType();
// Non-extending and any-extending loads are compatible with every ext opcode.
11251 if (LoadExt == ISD::NON_EXTLOAD || LoadExt == ISD::EXTLOAD)
11254 // Now LoadExt is either SEXTLOAD or ZEXTLOAD, ExtOpcode must have the same
11256 if ((LoadExt == ISD::SEXTLOAD && ExtOpcode != ISD::SIGN_EXTEND) ||
11257 (LoadExt == ISD::ZEXTLOAD && ExtOpcode != ISD::ZERO_EXTEND))
11264 /// (sext (select c, load x, load y)) -> (select c, sextload x, sextload y)
11265 /// (zext (select c, load x, load y)) -> (select c, zextload x, zextload y)
11266 /// (aext (select c, load x, load y)) -> (select c, extload x, extload y)
11267 /// This function is called by the DAGCombiner when visiting sext/zext/aext
11268 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
11269 static SDValue tryToFoldExtendSelectLoad(SDNode *N, const TargetLowering &TLI,
11270 SelectionDAG &DAG) {
11271 unsigned Opcode = N->getOpcode();
11272 SDValue N0 = N->getOperand(0);
11273 EVT VT = N->getValueType(0);
11276 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
11277 Opcode == ISD::ANY_EXTEND) &&
11278 "Expected EXTEND dag node in input!");
// Only select/vselect of two single-use loads (whose extension kind, if any,
// matches Opcode) are candidates; see isCompatibleLoad above.
11280 if (!(N0->getOpcode() == ISD::SELECT || N0->getOpcode() == ISD::VSELECT) ||
11284 SDValue Op1 = N0->getOperand(1);
11285 SDValue Op2 = N0->getOperand(2);
11286 if (!isCompatibleLoad(Op1, Opcode) || !isCompatibleLoad(Op2, Opcode))
// Map the extend opcode to the corresponding ext-load type
// (aext -> EXTLOAD by default).
11289 auto ExtLoadOpcode = ISD::EXTLOAD;
11290 if (Opcode == ISD::SIGN_EXTEND)
11291 ExtLoadOpcode = ISD::SEXTLOAD;
11292 else if (Opcode == ISD::ZERO_EXTEND)
11293 ExtLoadOpcode = ISD::ZEXTLOAD;
// Both widened loads must be legal for the target or we would trade one
// extend for two illegal loads.
11295 LoadSDNode *Load1 = cast<LoadSDNode>(Op1);
11296 LoadSDNode *Load2 = cast<LoadSDNode>(Op2);
11297 if (!TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load1->getMemoryVT()) ||
11298 !TLI.isLoadExtLegal(ExtLoadOpcode, VT, Load2->getMemoryVT()))
11301 SDValue Ext1 = DAG.getNode(Opcode, DL, VT, Op1);
11302 SDValue Ext2 = DAG.getNode(Opcode, DL, VT, Op2);
11303 return DAG.getSelect(DL, VT, N0->getOperand(0), Ext1, Ext2);
11306 /// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
11307 /// a build_vector of constants.
11308 /// This function is called by the DAGCombiner when visiting sext/zext/aext
11309 /// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
11310 /// Vector extends are not folded if operations are legal; this is to
11311 /// avoid introducing illegal build_vector dag nodes.
11312 static SDValue tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
11313 SelectionDAG &DAG, bool LegalTypes) {
11314 unsigned Opcode = N->getOpcode();
11315 SDValue N0 = N->getOperand(0);
11316 EVT VT = N->getValueType(0);
11319 assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
11320 Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
11321 Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
11322 && "Expected EXTEND dag node in input!");
11324 // fold (sext c1) -> c1
11325 // fold (zext c1) -> c1
11326 // fold (aext c1) -> c1
// getNode constant-folds an extend of a ConstantSDNode directly.
11327 if (isa<ConstantSDNode>(N0))
11328 return DAG.getNode(Opcode, DL, VT, N0);
11330 // fold (sext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
11331 // fold (zext (select cond, c1, c2)) -> (select cond, zext c1, zext c2)
11332 // fold (aext (select cond, c1, c2)) -> (select cond, sext c1, sext c2)
11333 if (N0->getOpcode() == ISD::SELECT) {
11334 SDValue Op1 = N0->getOperand(1);
11335 SDValue Op2 = N0->getOperand(2);
11336 if (isa<ConstantSDNode>(Op1) && isa<ConstantSDNode>(Op2) &&
11337 (Opcode != ISD::ZERO_EXTEND || !TLI.isZExtFree(N0.getValueType(), VT))) {
11338 // For any_extend, choose sign extension of the constants to allow a
11339 // possible further transform to sign_extend_inreg.i.e.
11341 // t1: i8 = select t0, Constant:i8<-1>, Constant:i8<0>
11342 // t2: i64 = any_extend t1
11344 // t3: i64 = select t0, Constant:i64<-1>, Constant:i64<0>
11346 // t4: i64 = sign_extend_inreg t3
11347 unsigned FoldOpc = Opcode;
11348 if (FoldOpc == ISD::ANY_EXTEND)
11349 FoldOpc = ISD::SIGN_EXTEND;
11350 return DAG.getSelect(DL, VT, N0->getOperand(0),
11351 DAG.getNode(FoldOpc, DL, VT, Op1),
11352 DAG.getNode(FoldOpc, DL, VT, Op2));
11356 // fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
11357 // fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
11358 // fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
11359 EVT SVT = VT.getScalarType();
11360 if (!(VT.isVector() && (!LegalTypes || TLI.isTypeLegal(SVT)) &&
11361 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
11364 // We can fold this node into a build_vector.
11365 unsigned VTBits = SVT.getSizeInBits();
11366 unsigned EVTBits = N0->getValueType(0).getScalarSizeInBits();
11367 SmallVector<SDValue, 8> Elts;
11368 unsigned NumElts = VT.getVectorNumElements();
11370 // For zero-extensions, UNDEF elements still guarantee to have the upper
11371 // bits set to zero.
11373 Opcode == ISD::ZERO_EXTEND || Opcode == ISD::ZERO_EXTEND_VECTOR_INREG;
11375 for (unsigned i = 0; i != NumElts; ++i) {
11376 SDValue Op = N0.getOperand(i);
11377 if (Op.isUndef()) {
11378 Elts.push_back(IsZext ? DAG.getConstant(0, DL, SVT) : DAG.getUNDEF(SVT));
11383 // Get the constant value and if needed trunc it to the size of the type.
11384 // Nodes like build_vector might have constants wider than the scalar type.
11385 APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
11386 if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
11387 Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
11389 Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
11392 return DAG.getBuildVector(VT, DL, Elts);
11395 // ExtendUsesToFormExtLoad - Trying to extend uses of a load to enable this:
11396 // "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
11397 // transformation. Returns true if extension are possible and the above
11398 // mentioned transformation is profitable.
// On success, SetCC users that must themselves be widened are collected into
// ExtendNodes for the caller to rewrite via ExtendSetCCUses.
11399 static bool ExtendUsesToFormExtLoad(EVT VT, SDNode *N, SDValue N0,
11401 SmallVectorImpl<SDNode *> &ExtendNodes,
11402 const TargetLowering &TLI) {
11403 bool HasCopyToRegUses = false;
11404 bool isTruncFree = TLI.isTruncateFree(VT, N0.getValueType());
// Walk every use of the load value and classify it as extendable or not.
11405 for (SDNode::use_iterator UI = N0->use_begin(), UE = N0->use_end(); UI != UE;
11407 SDNode *User = *UI;
// Only uses of the load's value result matter, not its chain result.
11410 if (UI.getUse().getResNo() != N0.getResNo())
11412 // FIXME: Only extend SETCC N, N and SETCC N, c for now.
11413 if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
11414 ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
11415 if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
11416 // Sign bits will be lost after a zext.
11419 for (unsigned i = 0; i != 2; ++i) {
11420 SDValue UseOp = User->getOperand(i);
11423 if (!isa<ConstantSDNode>(UseOp))
11428 ExtendNodes.push_back(User);
11431 // If truncates aren't free and there are users we can't
11432 // extend, it isn't worthwhile.
11435 // Remember if this value is live-out.
11436 if (User->getOpcode() == ISD::CopyToReg)
11437 HasCopyToRegUses = true;
11440 if (HasCopyToRegUses) {
11441 bool BothLiveOut = false;
11442 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
11444 SDUse &Use = UI.getUse();
11445 if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
11446 BothLiveOut = true;
11451 // Both unextended and extended values are live out. There had better be
11452 // a good reason for the transformation.
11453 return ExtendNodes.size();
// Rewrite the SetCC users collected by ExtendUsesToFormExtLoad: each compare
// of the original (narrow) load is replaced with a compare of the widened
// load, with its other operand extended via ExtType to match.
11458 void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
11459 SDValue OrigLoad, SDValue ExtLoad,
11460 ISD::NodeType ExtType) {
11461 // Extend SetCC uses if necessary.
11463 for (SDNode *SetCC : SetCCs) {
11464 SmallVector<SDValue, 4> Ops;
11466 for (unsigned j = 0; j != 2; ++j) {
11467 SDValue SOp = SetCC->getOperand(j);
// The load operand is substituted directly; the other operand is extended.
11468 if (SOp == OrigLoad)
11469 Ops.push_back(ExtLoad);
11471 Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
// Operand 2 is the condition code; it is carried over unchanged.
11474 Ops.push_back(SetCC->getOperand(2));
11475 CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
11479 // FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
11480 SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
11481 SDValue N0 = N->getOperand(0);
11482 EVT DstVT = N->getValueType(0);
11483 EVT SrcVT = N0.getValueType();
11485 assert((N->getOpcode() == ISD::SIGN_EXTEND ||
11486 N->getOpcode() == ISD::ZERO_EXTEND) &&
11487 "Unexpected node type (not an extend)!");
11489 // fold (sext (load x)) to multiple smaller sextloads; same for zext.
11490 // For example, on a target with legal v4i32, but illegal v8i32, turn:
11491 // (v8i32 (sext (v8i16 (load x))))
11493 // (v8i32 (concat_vectors (v4i32 (sextload x)),
11494 // (v4i32 (sextload (x + 16)))))
11495 // Where uses of the original load, i.e.:
11496 // (v8i16 (load x))
11497 // are replaced with:
11498 // (v8i16 (truncate
11499 // (v8i32 (concat_vectors (v4i32 (sextload x)),
11500 // (v4i32 (sextload (x + 16)))))))
11502 // This combine is only applicable to illegal, but splittable, vectors.
11503 // All legal types, and illegal non-vector types, are handled elsewhere.
11504 // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
11506 if (N0->getOpcode() != ISD::LOAD)
11509 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Require a simple (non-volatile/non-atomic), unindexed, non-extending load
// feeding a power-of-two-sized vector extend that the target wants widened.
11511 if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
11512 !N0.hasOneUse() || !LN0->isSimple() ||
11513 !DstVT.isVector() || !DstVT.isPow2VectorType() ||
11514 !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
11517 SmallVector<SDNode *, 4> SetCCs;
11518 if (!ExtendUsesToFormExtLoad(DstVT, N, N0, N->getOpcode(), SetCCs, TLI))
11521 ISD::LoadExtType ExtType =
11522 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
11524 // Try to split the vector types to get down to legal types.
11525 EVT SplitSrcVT = SrcVT;
11526 EVT SplitDstVT = DstVT;
// Halve the vector types until the target supports the ext-load, or until
// the source cannot be split further.
11527 while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
11528 SplitSrcVT.getVectorNumElements() > 1) {
11529 SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
11530 SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
11533 if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
11536 assert(!DstVT.isScalableVector() && "Unexpected scalable vector type");
11539 const unsigned NumSplits =
11540 DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
11541 const unsigned Stride = SplitSrcVT.getStoreSize();
11542 SmallVector<SDValue, 4> Loads;
11543 SmallVector<SDValue, 4> Chains;
// Emit NumSplits consecutive ext-loads at increasing offsets from the base.
11545 SDValue BasePtr = LN0->getBasePtr();
11546 for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
11547 const unsigned Offset = Idx * Stride;
11548 const Align Align = commonAlignment(LN0->getAlign(), Offset);
11550 SDValue SplitLoad = DAG.getExtLoad(
11551 ExtType, SDLoc(LN0), SplitDstVT, LN0->getChain(), BasePtr,
11552 LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
11553 LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
11555 BasePtr = DAG.getMemBasePlusOffset(BasePtr, TypeSize::Fixed(Stride), DL);
11557 Loads.push_back(SplitLoad.getValue(0));
11558 Chains.push_back(SplitLoad.getValue(1));
// Merge the split chains and values back into single nodes.
11561 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
11562 SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
11565 AddToWorklist(NewChain.getNode());
11567 CombineTo(N, NewValue);
11569 // Replace uses of the original load (before extension)
11570 // with a truncate of the concatenated sextloaded vectors.
11572 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
11573 ExtendSetCCUses(SetCCs, N0, NewValue, (ISD::NodeType)N->getOpcode());
11574 CombineTo(N0.getNode(), Trunc, NewChain);
11575 return SDValue(N, 0); // Return N so it doesn't get rechecked!
11578 // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
11579 // (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
11580 SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
11581 assert(N->getOpcode() == ISD::ZERO_EXTEND);
11582 EVT VT = N->getValueType(0);
11583 EVT OrigVT = N->getOperand(0).getValueType();
// If the zext is free anyway, there is nothing to gain from rewriting.
11584 if (TLI.isZExtFree(OrigVT, VT))
// Match the logic op: (and/or/xor X, constant), legal in the wide type.
11588 SDValue N0 = N->getOperand(0);
11589 if (!(N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
11590 N0.getOpcode() == ISD::XOR) ||
11591 N0.getOperand(1).getOpcode() != ISD::Constant ||
11592 (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT)))
// Match the shift op: (shl/srl Y, constant), legal in the wide type.
11596 SDValue N1 = N0->getOperand(0);
11597 if (!(N1.getOpcode() == ISD::SHL || N1.getOpcode() == ISD::SRL) ||
11598 N1.getOperand(1).getOpcode() != ISD::Constant ||
11599 (LegalOperations && !TLI.isOperationLegal(N1.getOpcode(), VT)))
// Match the load at the bottom of the chain; it must be zext-able to VT and
// must not already be a sign-extending or indexed load.
11603 if (!isa<LoadSDNode>(N1.getOperand(0)))
11605 LoadSDNode *Load = cast<LoadSDNode>(N1.getOperand(0));
11606 EVT MemVT = Load->getMemoryVT();
11607 if (!TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) ||
11608 Load->getExtensionType() == ISD::SEXTLOAD || Load->isIndexed())
11612 // If the shift op is SHL, the logic op must be AND, otherwise the result
11614 if (N1.getOpcode() == ISD::SHL && N0.getOpcode() != ISD::AND)
11617 if (!N0.hasOneUse() || !N1.hasOneUse())
11620 SmallVector<SDNode*, 4> SetCCs;
11621 if (!ExtendUsesToFormExtLoad(VT, N1.getNode(), N1.getOperand(0),
11622 ISD::ZERO_EXTEND, SetCCs, TLI))
11625 // Actually do the transformation.
11626 SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Load), VT,
11627 Load->getChain(), Load->getBasePtr(),
11628 Load->getMemoryVT(), Load->getMemOperand());
11631 SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
// Zero-extend the logic-op mask to the wide type.
11634 APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
11636 SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
11637 DAG.getConstant(Mask, DL0, VT));
11639 ExtendSetCCUses(SetCCs, N1.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
// If the old load is now only used here, just replace its chain; otherwise
// keep other users working via a truncate of the widened load.
11641 if (SDValue(Load, 0).hasOneUse()) {
11642 DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), ExtLoad.getValue(1));
11644 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(Load),
11645 Load->getValueType(0), ExtLoad);
11646 CombineTo(Load, Trunc, ExtLoad.getValue(1));
11649 // N0 is dead at this point.
11650 recursivelyDeleteUnusedNodes(N0.getNode());
11652 return SDValue(N,0); // Return N so it doesn't get rechecked!
11655 /// If we're narrowing or widening the result of a vector select and the final
11656 /// size is the same size as a setcc (compare) feeding the select, then try to
11657 /// apply the cast operation to the select's operands because matching vector
11658 /// sizes for a select condition and other operands should be more efficient.
11659 SDValue DAGCombiner::matchVSelectOpSizesWithSetCC(SDNode *Cast) {
11660 unsigned CastOpcode = Cast->getOpcode();
11661 assert((CastOpcode == ISD::SIGN_EXTEND || CastOpcode == ISD::ZERO_EXTEND ||
11662 CastOpcode == ISD::TRUNCATE || CastOpcode == ISD::FP_EXTEND ||
11663 CastOpcode == ISD::FP_ROUND) &&
11664 "Unexpected opcode for vector select narrowing/widening");
11666 // We only do this transform before legal ops because the pattern may be
11667 // obfuscated by target-specific operations after legalization. Do not create
11668 // an illegal select op, however, because that may be difficult to lower.
11669 EVT VT = Cast->getValueType(0);
11670 if (LegalOperations || !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
// Only a single-use vselect whose condition is a setcc is a candidate.
11673 SDValue VSel = Cast->getOperand(0);
11674 if (VSel.getOpcode() != ISD::VSELECT || !VSel.hasOneUse() ||
11675 VSel.getOperand(0).getOpcode() != ISD::SETCC)
11678 // Does the setcc have the same vector size as the casted select?
11679 SDValue SetCC = VSel.getOperand(0);
11680 EVT SetCCVT = getSetCCResultType(SetCC.getOperand(0).getValueType());
11681 if (SetCCVT.getSizeInBits() != VT.getSizeInBits())
11684 // cast (vsel (setcc X), A, B) --> vsel (setcc X), (cast A), (cast B)
11685 SDValue A = VSel.getOperand(1);
11686 SDValue B = VSel.getOperand(2);
11687 SDValue CastA, CastB;
11689 if (CastOpcode == ISD::FP_ROUND) {
11690 // FP_ROUND (fptrunc) has an extra flag operand to pass along.
11691 CastA = DAG.getNode(CastOpcode, DL, VT, A, Cast->getOperand(1));
11692 CastB = DAG.getNode(CastOpcode, DL, VT, B, Cast->getOperand(1));
11694 CastA = DAG.getNode(CastOpcode, DL, VT, A);
11695 CastB = DAG.getNode(CastOpcode, DL, VT, B);
11697 return DAG.getNode(ISD::VSELECT, DL, VT, SetCC, CastA, CastB);
11700 // fold ([s|z]ext ([s|z]extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11701 // fold ([s|z]ext ( extload x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11702 static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
11703 const TargetLowering &TLI, EVT VT,
11704 bool LegalOperations, SDNode *N,
11705 SDValue N0, ISD::LoadExtType ExtLoadType) {
11706 SDNode *N0Node = N0.getNode();
// The inner load must already extend the same way (sext for SEXTLOAD /
// zext for ZEXTLOAD) or be an any-extending load.
11707 bool isAExtLoad = (ExtLoadType == ISD::SEXTLOAD) ? ISD::isSEXTLoad(N0Node)
11708 : ISD::isZEXTLoad(N0Node);
11709 if ((!isAExtLoad && !ISD::isEXTLoad(N0Node)) ||
11710 !ISD::isUNINDEXEDLoad(N0Node) || !N0.hasOneUse())
11713 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
11714 EVT MemVT = LN0->getMemoryVT();
11715 if ((LegalOperations || !LN0->isSimple() ||
11717 !TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
// Build a single wider ext-load directly to VT and splice its chain in for
// the original load's chain.
11721 DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
11722 LN0->getBasePtr(), MemVT, LN0->getMemOperand());
11723 Combiner.CombineTo(N, ExtLoad);
11724 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
11725 if (LN0->use_empty())
11726 Combiner.recursivelyDeleteUnusedNodes(LN0);
11727 return SDValue(N, 0); // Return N so it doesn't get rechecked!
11730 // fold ([s|z]ext (load x)) -> ([s|z]ext (truncate ([s|z]extload x)))
11731 // Only generate vector extloads when 1) they're legal, and 2) they are
11732 // deemed desirable by the target.
11733 static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
11734 const TargetLowering &TLI, EVT VT,
11735 bool LegalOperations, SDNode *N, SDValue N0,
11736 ISD::LoadExtType ExtLoadType,
11737 ISD::NodeType ExtOpc) {
11738 // TODO: isFixedLengthVector() should be removed and any negative effects on
11739 // code generation being the result of that target's implementation of
11740 // isVectorLoadExtDesirable().
11741 if (!ISD::isNON_EXTLoad(N0.getNode()) ||
11742 !ISD::isUNINDEXEDLoad(N0.getNode()) ||
11743 ((LegalOperations || VT.isFixedLengthVector() ||
11744 !cast<LoadSDNode>(N0)->isSimple()) &&
11745 !TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
11748 bool DoXform = true;
11749 SmallVector<SDNode *, 4> SetCCs;
// A multi-use load is only worth widening if its other users can be
// extended too (profitability check collects affected setcc users).
11750 if (!N0.hasOneUse())
11751 DoXform = ExtendUsesToFormExtLoad(VT, N, N0, ExtOpc, SetCCs, TLI);
11753 DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
11757 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
11758 SDValue ExtLoad = DAG.getExtLoad(ExtLoadType, SDLoc(LN0), VT, LN0->getChain(),
11759 LN0->getBasePtr(), N0.getValueType(),
11760 LN0->getMemOperand());
11761 Combiner.ExtendSetCCUses(SetCCs, N0, ExtLoad, ExtOpc);
11762 // If the load value is used only by N, replace it via CombineTo N.
11763 bool NoReplaceTrunc = SDValue(LN0, 0).hasOneUse();
11764 Combiner.CombineTo(N, ExtLoad);
11765 if (NoReplaceTrunc) {
11766 DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
11767 Combiner.recursivelyDeleteUnusedNodes(LN0);
// Other users of the narrow load keep working through a truncate of the
// widened value.
11770 DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
11771 Combiner.CombineTo(LN0, Trunc, ExtLoad.getValue(1));
11773 return SDValue(N, 0); // Return N so it doesn't get rechecked!
// fold ([s|z]ext (masked_load x)) -> ([s|z]ext-masked_load x), when the
// inner masked load is single-use, non-extending, and the target supports
// (and wants) the widened ext-load form.
11776 static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
11777 const TargetLowering &TLI, EVT VT,
11778 SDNode *N, SDValue N0,
11779 ISD::LoadExtType ExtLoadType,
11780 ISD::NodeType ExtOpc) {
11781 if (!N0.hasOneUse())
11784 MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
11785 if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
11788 if (!TLI.isLoadExtLegalOrCustom(ExtLoadType, VT, Ld->getValueType(0)))
11791 if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
// The pass-through value must be widened explicitly since masked-off lanes
// take it unmodified.
11795 SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
11796 SDValue NewLoad = DAG.getMaskedLoad(
11797 VT, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(),
11798 PassThru, Ld->getMemoryVT(), Ld->getMemOperand(), Ld->getAddressingMode(),
11799 ExtLoadType, Ld->isExpandingLoad());
11800 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
// Fold an extended sign-bit test into a shift:
//   sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
//   zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
// i.e. "X is non-negative" materialized directly from the inverted sign bit.
11804 static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
11805 bool LegalOperations) {
11806 assert((N->getOpcode() == ISD::SIGN_EXTEND ||
11807 N->getOpcode() == ISD::ZERO_EXTEND) && "Expected sext or zext");
11809 SDValue SetCC = N->getOperand(0);
11810 if (LegalOperations || SetCC.getOpcode() != ISD::SETCC ||
11811 !SetCC.hasOneUse() || SetCC.getValueType() != MVT::i1)
11814 SDValue X = SetCC.getOperand(0);
11815 SDValue Ones = SetCC.getOperand(1);
11816 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
11817 EVT VT = N->getValueType(0);
11818 EVT XVT = X.getValueType();
11819 // setge X, C is canonicalized to setgt, so we do not need to match that
11820 // pattern. The setlt sibling is folded in SimplifySelectCC() because it does
11821 // not require the 'not' op.
11822 if (CC == ISD::SETGT && isAllOnesConstant(Ones) && VT == XVT) {
11823 // Invert and smear/shift the sign bit:
11824 // sext i1 (setgt iN X, -1) --> sra (not X), (N - 1)
11825 // zext i1 (setgt iN X, -1) --> srl (not X), (N - 1)
11827 unsigned ShCt = VT.getSizeInBits() - 1;
11828 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Bail out if the target considers this shift amount expensive.
11829 if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
11830 SDValue NotX = DAG.getNOT(DL, X, VT);
11831 SDValue ShiftAmount = DAG.getConstant(ShCt, DL, VT);
11833 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SRA : ISD::SRL;
11834 return DAG.getNode(ShiftOpcode, DL, VT, NotX, ShiftAmount);
// Combine sext(setcc ...). Tries, in order: using a wide vector setcc
// directly (when the target's boolean contents are 0/-1), widening the
// compare operands when the narrow compare type is unsupported, and finally
// materializing the result as (select (setcc), T, 0).
11840 SDValue DAGCombiner::foldSextSetcc(SDNode *N) {
11841 SDValue N0 = N->getOperand(0);
11842 if (N0.getOpcode() != ISD::SETCC)
11845 SDValue N00 = N0.getOperand(0);
11846 SDValue N01 = N0.getOperand(1);
11847 ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
11848 EVT VT = N->getValueType(0);
11849 EVT N00VT = N00.getValueType();
11852 // Propagate fast-math-flags.
11853 SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
11855 // On some architectures (such as SSE/NEON/etc) the SETCC result type is
11856 // the same size as the compared operands. Try to optimize sext(setcc())
11857 // if this is the case.
11858 if (VT.isVector() && !LegalOperations &&
11859 TLI.getBooleanContents(N00VT) ==
11860 TargetLowering::ZeroOrNegativeOneBooleanContent) {
11861 EVT SVT = getSetCCResultType(N00VT);
11863 // If we already have the desired type, don't change it.
11864 if (SVT != N0.getValueType()) {
11865 // We know that the # elements of the results is the same as the
11866 // # elements of the compare (and the # elements of the compare result
11867 // for that matter). Check to see that they are the same size. If so,
11868 // we know that the element size of the sext'd result matches the
11869 // element size of the compare operands.
11870 if (VT.getSizeInBits() == SVT.getSizeInBits())
11871 return DAG.getSetCC(DL, VT, N00, N01, CC);
11873 // If the desired elements are smaller or larger than the source
11874 // elements, we can use a matching integer vector type and then
11875 // truncate/sign extend.
11876 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
11877 if (SVT == MatchingVecType) {
11878 SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
11879 return DAG.getSExtOrTrunc(VsetCC, DL, VT);
11883 // Try to eliminate the sext of a setcc by zexting the compare operands.
11884 if (N0.hasOneUse() && TLI.isOperationLegalOrCustom(ISD::SETCC, VT) &&
11885 !TLI.isOperationLegalOrCustom(ISD::SETCC, SVT)) {
// For signed predicates, the operands must be sign-extended (sextload)
// to preserve comparison semantics; unsigned predicates can zext.
11886 bool IsSignedCmp = ISD::isSignedIntSetCC(CC);
11887 unsigned LoadOpcode = IsSignedCmp ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
11888 unsigned ExtOpcode = IsSignedCmp ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
11890 // We have an unsupported narrow vector compare op that would be legal
11891 // if extended to the destination type. See if the compare operands
11892 // can be freely extended to the destination type.
11893 auto IsFreeToExtend = [&](SDValue V) {
11894 if (isConstantOrConstantVector(V, /*NoOpaques*/ true))
11896 // Match a simple, non-extended load that can be converted to a
11897 // legal {z/s}ext-load.
11898 // TODO: Allow widening of an existing {z/s}ext-load?
11899 if (!(ISD::isNON_EXTLoad(V.getNode()) &&
11900 ISD::isUNINDEXEDLoad(V.getNode()) &&
11901 cast<LoadSDNode>(V)->isSimple() &&
11902 TLI.isLoadExtLegal(LoadOpcode, VT, V.getValueType())))
11905 // Non-chain users of this value must either be the setcc in this
11906 // sequence or extends that can be folded into the new {z/s}ext-load.
11907 for (SDNode::use_iterator UI = V->use_begin(), UE = V->use_end();
11909 // Skip uses of the chain and the setcc.
11910 SDNode *User = *UI;
11911 if (UI.getUse().getResNo() != 0 || User == N0.getNode())
11913 // Extra users must have exactly the same cast we are about to create.
11914 // TODO: This restriction could be eased if ExtendUsesToFormExtLoad()
11915 // is enhanced similarly.
11916 if (User->getOpcode() != ExtOpcode || User->getValueType(0) != VT)
11922 if (IsFreeToExtend(N00) && IsFreeToExtend(N01)) {
11923 SDValue Ext0 = DAG.getNode(ExtOpcode, DL, VT, N00);
11924 SDValue Ext1 = DAG.getNode(ExtOpcode, DL, VT, N01);
11925 return DAG.getSetCC(DL, VT, Ext0, Ext1, CC);
11930 // sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0)
11931 // Here, T can be 1 or -1, depending on the type of the setcc and
11932 // getBooleanContents().
11933 unsigned SetCCWidth = N0.getScalarValueSizeInBits();
11935 // To determine the "true" side of the select, we need to know the high bit
11936 // of the value returned by the setcc if it evaluates to true.
11937 // If the type of the setcc is i1, then the true case of the select is just
11938 // sext(i1 1), that is, -1.
11939 // If the type of the setcc is larger (say, i8) then the value of the high
11940 // bit depends on getBooleanContents(), so ask TLI for a real "true" value
11941 // of the appropriate width.
11942 SDValue ExtTrueVal = (SetCCWidth == 1)
11943 ? DAG.getAllOnesConstant(DL, VT)
11944 : DAG.getBoolConstant(true, DL, VT, N00VT);
11945 SDValue Zero = DAG.getConstant(0, DL, VT);
11946 if (SDValue SCC = SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
11949 if (!VT.isVector() && !TLI.convertSelectOfConstantsToMath(VT)) {
11950 EVT SetCCVT = getSetCCResultType(N00VT);
11951 // Don't do this transform for i1 because there's a select transform
11952 // that would reverse it.
11953 // TODO: We should not do this transform at all without a target hook
11954 // because a sext is likely cheaper than a select?
11955 if (SetCCVT.getScalarSizeInBits() != 1 &&
11956 (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
11957 SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
11958 return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
// Combine visitor for ISD::SIGN_EXTEND. Tries a sequence of independent
// folds (constant folding, collapsing ext-of-ext, narrowing through
// truncate, forming sextloads, setcc/select rewrites, and algebraic
// rewrites in the destination type); each successful fold returns the
// replacement value immediately.
// NOTE(review): this extraction elides some original lines (several early
// returns / closing braces are not visible here).
11965 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
11966   SDValue N0 = N->getOperand(0);
11967   EVT VT = N->getValueType(0);
11970   // sext(undef) = 0 because the top bit will all be the same.
11972     return DAG.getConstant(0, DL, VT);
11974   if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
11977   // fold (sext (sext x)) -> (sext x)
11978   // fold (sext (aext x)) -> (sext x)
11979   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
11980     return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
11982   if (N0.getOpcode() == ISD::TRUNCATE) {
11983     // fold (sext (truncate (load x))) -> (sext (smaller load x))
11984     // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
11985     if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
11986       SDNode *oye = N0.getOperand(0).getNode();
11987       if (NarrowLoad.getNode() != N0.getNode()) {
11988         CombineTo(N0.getNode(), NarrowLoad);
11989         // CombineTo deleted the truncate, if needed, but not what's under it.
11990         AddToWorklist(oye);
11992       return SDValue(N, 0); // Return N so it doesn't get rechecked!
11995     // See if the value being truncated is already sign extended. If so, just
11996     // eliminate the trunc/sext pair.
11997     SDValue Op = N0.getOperand(0);
11998     unsigned OpBits = Op.getScalarValueSizeInBits();
11999     unsigned MidBits = N0.getScalarValueSizeInBits();
12000     unsigned DestBits = VT.getScalarSizeInBits();
12001     unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
12003     if (OpBits == DestBits) {
12004       // Op is i32, Mid is i8, and Dest is i32.  If Op has more than 24 sign
12005       // bits, it is already ready.
12006       if (NumSignBits > DestBits-MidBits)
12008     } else if (OpBits < DestBits) {
12009       // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
12010       // bits, just sext from i32.
12011       if (NumSignBits > OpBits-MidBits)
12012         return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
12014       // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
12015       // bits, just truncate to i32.
12016       if (NumSignBits > OpBits-MidBits)
12017         return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
12020     // fold (sext (truncate x)) -> (sextinreg x).
12021     if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
12022                                                  N0.getValueType())) {
12023       if (OpBits < DestBits)
12024         Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
12025       else if (OpBits > DestBits)
12026         Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
12027       return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
12028                          DAG.getValueType(N0.getValueType()));
12032   // Try to simplify (sext (load x)).
12033   if (SDValue foldedExt =
12034           tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
12035                              ISD::SEXTLOAD, ISD::SIGN_EXTEND))
12038   if (SDValue foldedExt =
12039           tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
12043   // fold (sext (load x)) to multiple smaller sextloads.
12044   // Only on illegal but splittable vectors.
12045   if (SDValue ExtLoad = CombineExtLoad(N))
12048   // Try to simplify (sext (sextload x)).
12049   if (SDValue foldedExt = tryToFoldExtOfExtload(
12050           DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::SEXTLOAD))
12053   // fold (sext (and/or/xor (load x), cst)) ->
12054   //      (and/or/xor (sextload x), (sext cst))
12055   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
12056        N0.getOpcode() == ISD::XOR) &&
12057       isa<LoadSDNode>(N0.getOperand(0)) &&
12058       N0.getOperand(1).getOpcode() == ISD::Constant &&
12059       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
12060     LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
12061     EVT MemVT = LN00->getMemoryVT();
        // Only safe when the extending load is legal and the existing load is
        // not already zero-extending (a sextload can't replace a zextload).
12062     if (TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT) &&
12063         LN00->getExtensionType() != ISD::ZEXTLOAD && LN00->isUnindexed()) {
12064       SmallVector<SDNode*, 4> SetCCs;
12065       bool DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
12066                                              ISD::SIGN_EXTEND, SetCCs, TLI);
12068         SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN00), VT,
12069                                          LN00->getChain(), LN00->getBasePtr(),
12070                                          LN00->getMemoryVT(),
12071                                          LN00->getMemOperand());
        // Sign-extend the constant operand to the wide type so the logic op
        // is performed entirely in VT.
12072         APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
12073         SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
12074                                   ExtLoad, DAG.getConstant(Mask, DL, VT));
12075         ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
12076         bool NoReplaceTruncAnd = !N0.hasOneUse();
12077         bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
12079         // If N0 has multiple uses, change other uses as well.
12080         if (NoReplaceTruncAnd) {
12082               DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
12083           CombineTo(N0.getNode(), TruncAnd);
12085         if (NoReplaceTrunc) {
12086           DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
12088           SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
12089                                       LN00->getValueType(0), ExtLoad);
12090           CombineTo(LN00, Trunc, ExtLoad.getValue(1));
12092         return SDValue(N,0); // Return N so it doesn't get rechecked!
12097   if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
12100   if (SDValue V = foldSextSetcc(N))
12103   // fold (sext x) -> (zext x) if the sign bit is known zero.
12104   if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
12105       DAG.SignBitIsZero(N0))
12106     return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
12108   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
12111   // Eliminate this sign extend by doing a negation in the destination type:
12112   // sext i32 (0 - (zext i8 X to i32)) to i64 --> 0 - (zext i8 X to i64)
12113   if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() &&
12114       isNullOrNullSplat(N0.getOperand(0)) &&
12115       N0.getOperand(1).getOpcode() == ISD::ZERO_EXTEND &&
12116       TLI.isOperationLegalOrCustom(ISD::SUB, VT)) {
12117     SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(1).getOperand(0), DL, VT);
12118     return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Zext);
12120   // Eliminate this sign extend by doing a decrement in the destination type:
12121   // sext i32 ((zext i8 X to i32) + (-1)) to i64 --> (zext i8 X to i64) + (-1)
12122   if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() &&
12123       isAllOnesOrAllOnesSplat(N0.getOperand(1)) &&
12124       N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
12125       TLI.isOperationLegalOrCustom(ISD::ADD, VT)) {
12126     SDValue Zext = DAG.getZExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
12127     return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
12130   // fold sext (not i1 X) -> add (zext i1 X), -1
12131   // TODO: This could be extended to handle bool vectors.
12132   if (N0.getValueType() == MVT::i1 && isBitwiseNot(N0) && N0.hasOneUse() &&
12133       (!LegalOperations || (TLI.isOperationLegal(ISD::ZERO_EXTEND, VT) &&
12134                             TLI.isOperationLegal(ISD::ADD, VT)))) {
12135     // If we can eliminate the 'not', the sext form should be better
12136     if (SDValue NewXor = visitXOR(N0.getNode())) {
12137       // Returning N0 is a form of in-visit replacement that may have
12139       if (NewXor.getNode() == N0.getNode()) {
12140         // Return SDValue here as the xor should have already been replaced in
12145       // Return a new sext with the new xor.
12146       return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NewXor);
      // The 'not' could not be eliminated: emit zext(X) + (-1), which equals
      // sext(not X) for an i1 X.
12149     SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
12150     return DAG.getNode(ISD::ADD, DL, VT, Zext, DAG.getAllOnesConstant(DL, VT));
12153   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
12159 // isTruncateOf - If N is a truncate of some other value, return true, record
12160 // the value being truncated in Op and which of Op's bits are zero/one in Known.
12161 // This function computes KnownBits to avoid a duplicated call to
12162 // computeKnownBits in the caller.
// In addition to a plain TRUNCATE, an i1 (setcc X, 0, ne) is also treated as
// a truncate-to-i1 of X when all bits of X above bit 0 are known zero.
12163 static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
12164                          KnownBits &Known) {
12165   if (N->getOpcode() == ISD::TRUNCATE) {
12166     Op = N->getOperand(0);
12167     Known = DAG.computeKnownBits(Op);
      // Only the (setcc X, 0, ne) form with an i1 scalar result is handled
      // below; anything else is not a truncate.
12171   if (N.getOpcode() != ISD::SETCC ||
12172       N.getValueType().getScalarType() != MVT::i1 ||
12173       cast<CondCodeSDNode>(N.getOperand(2))->get() != ISD::SETNE)
12176   SDValue Op0 = N->getOperand(0);
12177   SDValue Op1 = N->getOperand(1);
12178   assert(Op0.getValueType() == Op1.getValueType());
      // Pick the non-zero operand as the value being "truncated".
12180   if (isNullOrNullSplat(Op0))
12182   else if (isNullOrNullSplat(Op1))
12187   Known = DAG.computeKnownBits(Op);
      // The setcc acts as a truncate only if every bit of Op other than
      // bit 0 is known to be zero.
12189   return (Known.Zero | 1).isAllOnes();
12192 /// Given an extending node with a pop-count operand, if the target does not
12193 /// support a pop-count in the narrow source type but does support it in the
12194 /// destination type, widen the pop-count to the destination type.
///
/// Valid for zext and aext because widening the ctpop input with zeros does
/// not change the number of set bits. Returns the widened ctpop, or an empty
/// SDValue if the fold does not apply.
12195 static SDValue widenCtPop(SDNode *Extend, SelectionDAG &DAG) {
12196   assert((Extend->getOpcode() == ISD::ZERO_EXTEND ||
12197           Extend->getOpcode() == ISD::ANY_EXTEND) && "Expected extend op");
12199   SDValue CtPop = Extend->getOperand(0);
12200   if (CtPop.getOpcode() != ISD::CTPOP || !CtPop.hasOneUse())
12203   EVT VT = Extend->getValueType(0);
12204   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      // Only profitable when the narrow ctpop is unsupported but the wide
      // one is supported.
12205   if (TLI.isOperationLegalOrCustom(ISD::CTPOP, CtPop.getValueType()) ||
12206       !TLI.isOperationLegalOrCustom(ISD::CTPOP, VT))
12209   // zext (ctpop X) --> ctpop (zext X)
12211   SDValue NewZext = DAG.getZExtOrTrunc(CtPop.getOperand(0), DL, VT);
12212   return DAG.getNode(ISD::CTPOP, DL, VT, NewZext);
// Combine visitor for ISD::ZERO_EXTEND. Mirrors visitSIGN_EXTEND: tries
// constant folding, ext-of-ext collapsing, truncate elimination via known
// zero bits, zextload formation, setcc rewrites, and shift-specific folds.
// NOTE(review): this extraction elides some original lines (several early
// returns / closing braces are not visible here).
12215 SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
12216   SDValue N0 = N->getOperand(0);
12217   EVT VT = N->getValueType(0);
12221     return DAG.getConstant(0, SDLoc(N), VT);
12223   if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
12226   // fold (zext (zext x)) -> (zext x)
12227   // fold (zext (aext x)) -> (zext x)
12228   if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
12229     return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),  VT,
12232   // fold (zext (truncate x)) -> (zext x) or
12233   //      (zext (truncate x)) -> (truncate x)
12234   // This is valid when the truncated bits of x are already zero.
12237   if (isTruncateOf(DAG, N0, Op, Known)) {
      // TruncatedBits is the set of bits of Op that the truncate discards
      // (empty when Op and N0 have the same scalar width).
12238     APInt TruncatedBits =
12239       (Op.getScalarValueSizeInBits() == N0.getScalarValueSizeInBits()) ?
12240       APInt(Op.getScalarValueSizeInBits(), 0) :
12241       APInt::getBitsSet(Op.getScalarValueSizeInBits(),
12242                         N0.getScalarValueSizeInBits(),
12243                         std::min(Op.getScalarValueSizeInBits(),
12244                                  VT.getScalarSizeInBits()));
12245     if (TruncatedBits.isSubsetOf(Known.Zero))
12246       return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
12249   // fold (zext (truncate x)) -> (and x, mask)
12250   if (N0.getOpcode() == ISD::TRUNCATE) {
12251     // fold (zext (truncate (load x))) -> (zext (smaller load x))
12252     // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
12253     if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
12254       SDNode *oye = N0.getOperand(0).getNode();
12255       if (NarrowLoad.getNode() != N0.getNode()) {
12256         CombineTo(N0.getNode(), NarrowLoad);
12257         // CombineTo deleted the truncate, if needed, but not what's under it.
12258         AddToWorklist(oye);
12260       return SDValue(N, 0); // Return N so it doesn't get rechecked!
12263     EVT SrcVT = N0.getOperand(0).getValueType();
12264     EVT MinVT = N0.getValueType();
12266     // Try to mask before the extension to avoid having to generate a larger mask,
12267     // possibly over several sub-vectors.
12268     if (SrcVT.bitsLT(VT) && VT.isVector()) {
12269       if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
12270                                TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
12271         SDValue Op = N0.getOperand(0);
12272         Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
12273         AddToWorklist(Op.getNode());
12274         SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
12275         // Transfer the debug info; the new node is equivalent to N0.
12276         DAG.transferDbgValues(N0, ZExtOrTrunc);
12277         return ZExtOrTrunc;
12281     if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
12282       SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
12283       AddToWorklist(Op.getNode());
12284       SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
12285       // We may safely transfer the debug info describing the truncate node over
12286       // to the equivalent and operation.
12287       DAG.transferDbgValues(N0, And);
12292   // Fold (zext (and (trunc x), cst)) -> (and x, cst),
12293   // if either of the casts is not free.
12294   if (N0.getOpcode() == ISD::AND &&
12295       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
12296       N0.getOperand(1).getOpcode() == ISD::Constant &&
12297       (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
12298                            N0.getValueType()) ||
12299        !TLI.isZExtFree(N0.getValueType(), VT))) {
12300     SDValue X = N0.getOperand(0).getOperand(0);
12301     X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
      // Zero-extend the mask constant so the AND clears the same bits in VT.
12302     APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
12304     return DAG.getNode(ISD::AND, DL, VT,
12305                        X, DAG.getConstant(Mask, DL, VT));
12308   // Try to simplify (zext (load x)).
12309   if (SDValue foldedExt =
12310           tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
12311                              ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
12314   if (SDValue foldedExt =
12315           tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
12319   // fold (zext (load x)) to multiple smaller zextloads.
12320   // Only on illegal but splittable vectors.
12321   if (SDValue ExtLoad = CombineExtLoad(N))
12324   // fold (zext (and/or/xor (load x), cst)) ->
12325   //      (and/or/xor (zextload x), (zext cst))
12326   // Unless (and (load x) cst) will match as a zextload already and has
12327   // additional users.
12328   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
12329        N0.getOpcode() == ISD::XOR) &&
12330       isa<LoadSDNode>(N0.getOperand(0)) &&
12331       N0.getOperand(1).getOpcode() == ISD::Constant &&
12332       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
12333     LoadSDNode *LN00 = cast<LoadSDNode>(N0.getOperand(0));
12334     EVT MemVT = LN00->getMemoryVT();
        // A zextload can't replace an existing sextload.
12335     if (TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT) &&
12336         LN00->getExtensionType() != ISD::SEXTLOAD && LN00->isUnindexed()) {
12337       bool DoXform = true;
12338       SmallVector<SDNode*, 4> SetCCs;
12339       if (!N0.hasOneUse()) {
12340         if (N0.getOpcode() == ISD::AND) {
12341           auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
12342           EVT LoadResultTy = AndC->getValueType(0);
12344           if (isAndLoadExtLoad(AndC, LN00, LoadResultTy, ExtVT))
12349         DoXform = ExtendUsesToFormExtLoad(VT, N0.getNode(), N0.getOperand(0),
12350                                           ISD::ZERO_EXTEND, SetCCs, TLI);
12352         SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN00), VT,
12353                                          LN00->getChain(), LN00->getBasePtr(),
12354                                          LN00->getMemoryVT(),
12355                                          LN00->getMemOperand());
12356         APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
12358         SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
12359                                   ExtLoad, DAG.getConstant(Mask, DL, VT));
12360         ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::ZERO_EXTEND);
12361         bool NoReplaceTruncAnd = !N0.hasOneUse();
12362         bool NoReplaceTrunc = SDValue(LN00, 0).hasOneUse();
12364         // If N0 has multiple uses, change other uses as well.
12365         if (NoReplaceTruncAnd) {
12367             DAG.getNode(ISD::TRUNCATE, DL, N0.getValueType(), And);
12368           CombineTo(N0.getNode(), TruncAnd);
12370         if (NoReplaceTrunc) {
12371           DAG.ReplaceAllUsesOfValueWith(SDValue(LN00, 1), ExtLoad.getValue(1));
12373           SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(LN00),
12374                                       LN00->getValueType(0), ExtLoad);
12375           CombineTo(LN00, Trunc, ExtLoad.getValue(1));
12377         return SDValue(N,0); // Return N so it doesn't get rechecked!
12382   // fold (zext (and/or/xor (shl/shr (load x), cst), cst)) ->
12383   //      (and/or/xor (shl/shr (zextload x), (zext cst)), (zext cst))
12384   if (SDValue ZExtLoad = CombineZExtLogicopShiftLoad(N))
12387   // Try to simplify (zext (zextload x)).
12388   if (SDValue foldedExt = tryToFoldExtOfExtload(
12389           DAG, *this, TLI, VT, LegalOperations, N, N0, ISD::ZEXTLOAD))
12392   if (SDValue V = foldExtendedSignBitTest(N, DAG, LegalOperations))
12395   if (N0.getOpcode() == ISD::SETCC) {
12396     // Propagate fast-math-flags.
12397     SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
12399     // Only do this before legalize for now.
12400     if (!LegalOperations && VT.isVector() &&
12401         N0.getValueType().getVectorElementType() == MVT::i1) {
12402       EVT N00VT = N0.getOperand(0).getValueType();
12403       if (getSetCCResultType(N00VT) == N0.getValueType())
12406       // We know that the # elements of the results is the same as the #
12407       // elements of the compare (and the # elements of the compare result for
12408       // that matter). Check to see that they are the same size. If so, we know
12409       // that the element size of the sext'd result matches the element size of
12410       // the compare operands.
12412       if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
12413         // zext(setcc) -> zext_in_reg(vsetcc) for vectors.
12414         SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
12415                                      N0.getOperand(1), N0.getOperand(2));
12416         return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
12419       // If the desired elements are smaller or larger than the source
12420       // elements we can use a matching integer vector type and then
12421       // truncate/any extend followed by zext_in_reg.
12422       EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
12424           DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
12425                       N0.getOperand(1), N0.getOperand(2));
12426       return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
12427                                     N0.getValueType());
12430     // zext(setcc x,y,cc) -> zext(select x, y, true, false, cc)
12432     EVT N0VT = N0.getValueType();
12433     EVT N00VT = N0.getOperand(0).getValueType();
12434     if (SDValue SCC = SimplifySelectCC(
12435             DL, N0.getOperand(0), N0.getOperand(1),
12436             DAG.getBoolConstant(true, DL, N0VT, N00VT),
12437             DAG.getBoolConstant(false, DL, N0VT, N00VT),
12438             cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
12439       return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, SCC);
12442   // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
12443   if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
12444       isa<ConstantSDNode>(N0.getOperand(1)) &&
12445       N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
12447     SDValue ShAmt = N0.getOperand(1);
12448     if (N0.getOpcode() == ISD::SHL) {
12449       SDValue InnerZExt = N0.getOperand(0);
12450       // If the original shl may be shifting out bits, do not perform this
        // KnownZeroBits: number of high bits guaranteed zero by the inner zext.
12452       unsigned KnownZeroBits = InnerZExt.getValueSizeInBits() -
12453                                InnerZExt.getOperand(0).getValueSizeInBits();
12454       if (cast<ConstantSDNode>(ShAmt)->getAPIntValue().ugt(KnownZeroBits))
12460     // Ensure that the shift amount is wide enough for the shifted value.
12461     if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
12462       ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
12464     return DAG.getNode(N0.getOpcode(), DL, VT,
12465                        DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
12469   if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
12472   if (SDValue NewCtPop = widenCtPop(N, DAG))
12475   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
// Combine visitor for ISD::ANY_EXTEND. An aext places no requirement on the
// high bits, so folds here are more permissive than sext/zext: aext(undef)
// stays undef, aext(truncate x) collapses directly, and vector loads are
// folded to zext (no target supports load+any_ext on vectors in one op).
// NOTE(review): this extraction elides some original lines (several early
// returns / closing braces are not visible here).
12481 SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
12482   SDValue N0 = N->getOperand(0);
12483   EVT VT = N->getValueType(0);
12485   // aext(undef) = undef
12487     return DAG.getUNDEF(VT);
12489   if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
12492   // fold (aext (aext x)) -> (aext x)
12493   // fold (aext (zext x)) -> (zext x)
12494   // fold (aext (sext x)) -> (sext x)
12495   if (N0.getOpcode() == ISD::ANY_EXTEND  ||
12496       N0.getOpcode() == ISD::ZERO_EXTEND ||
12497       N0.getOpcode() == ISD::SIGN_EXTEND)
12498     return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
12500   // fold (aext (truncate (load x))) -> (aext (smaller load x))
12501   // fold (aext (truncate (srl (load x), c))) -> (aext (small load (x+c/n)))
12502   if (N0.getOpcode() == ISD::TRUNCATE) {
12503     if (SDValue NarrowLoad = reduceLoadWidth(N0.getNode())) {
12504       SDNode *oye = N0.getOperand(0).getNode();
12505       if (NarrowLoad.getNode() != N0.getNode()) {
12506         CombineTo(N0.getNode(), NarrowLoad);
12507         // CombineTo deleted the truncate, if needed, but not what's under it.
12508         AddToWorklist(oye);
12510       return SDValue(N, 0); // Return N so it doesn't get rechecked!
12514   // fold (aext (truncate x))
12515   if (N0.getOpcode() == ISD::TRUNCATE)
12516     return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
12518   // Fold (aext (and (trunc x), cst)) -> (and x, cst)
12519   // if the trunc is not free.
12520   if (N0.getOpcode() == ISD::AND &&
12521       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
12522       N0.getOperand(1).getOpcode() == ISD::Constant &&
12523       !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
12524                           N0.getValueType())) {
12526     SDValue X = DAG.getAnyExtOrTrunc(N0.getOperand(0).getOperand(0), DL, VT);
12527     SDValue Y = DAG.getNode(ISD::ANY_EXTEND, DL, VT, N0.getOperand(1));
12528     assert(isa<ConstantSDNode>(Y) && "Expected constant to be folded!");
12529     return DAG.getNode(ISD::AND, DL, VT, X, Y);
12532   // fold (aext (load x)) -> (aext (truncate (extload x)))
12533   // None of the supported targets knows how to perform load and any_ext
12534   // on vectors in one instruction, so attempt to fold to zext instead.
12535   if (VT.isVector()) {
12536     // Try to simplify (zext (load x)).
12537     if (SDValue foldedExt =
12538             tryToFoldExtOfLoad(DAG, *this, TLI, VT, LegalOperations, N, N0,
12539                                ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
12541   } else if (ISD::isNON_EXTLoad(N0.getNode()) &&
12542              ISD::isUNINDEXEDLoad(N0.getNode()) &&
12543              TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
12544     bool DoXform = true;
12545     SmallVector<SDNode *, 4> SetCCs;
12546     if (!N0.hasOneUse())
12548           ExtendUsesToFormExtLoad(VT, N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
12550       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12551       SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
12552                                        LN0->getChain(), LN0->getBasePtr(),
12553                                        N0.getValueType(), LN0->getMemOperand());
12554       ExtendSetCCUses(SetCCs, N0, ExtLoad, ISD::ANY_EXTEND);
12555       // If the load value is used only by N, replace it via CombineTo N.
12556       bool NoReplaceTrunc = N0.hasOneUse();
12557       CombineTo(N, ExtLoad);
12558       if (NoReplaceTrunc) {
12559         DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
12560         recursivelyDeleteUnusedNodes(LN0);
          // Otherwise keep the narrow value alive for other users via a
          // truncate of the new extending load.
12563             DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), ExtLoad);
12564         CombineTo(LN0, Trunc, ExtLoad.getValue(1));
12566       return SDValue(N, 0); // Return N so it doesn't get rechecked!
12570   // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
12571   // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
12572   // fold (aext ( extload x)) -> (aext (truncate (extload x)))
12573   if (N0.getOpcode() == ISD::LOAD && !ISD::isNON_EXTLoad(N0.getNode()) &&
12574       ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
12575     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12576     ISD::LoadExtType ExtType = LN0->getExtensionType();
12577     EVT MemVT = LN0->getMemoryVT();
12578     if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
        // Re-issue the existing extending load directly at the wider type.
12579       SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
12580                                        VT, LN0->getChain(), LN0->getBasePtr(),
12581                                        MemVT, LN0->getMemOperand());
12582       CombineTo(N, ExtLoad);
12583       DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), ExtLoad.getValue(1));
12584       recursivelyDeleteUnusedNodes(LN0);
12585       return SDValue(N, 0); // Return N so it doesn't get rechecked!
12589   if (N0.getOpcode() == ISD::SETCC) {
12590     // Propagate fast-math-flags.
12591     SelectionDAG::FlagInserter FlagsInserter(DAG, N0->getFlags());
12594     // aext(setcc) -> vsetcc
12595     // aext(setcc) -> truncate(vsetcc)
12596     // aext(setcc) -> aext(vsetcc)
12597     // Only do this before legalize for now.
12598     if (VT.isVector() && !LegalOperations) {
12599       EVT N00VT = N0.getOperand(0).getValueType();
12600       if (getSetCCResultType(N00VT) == N0.getValueType())
12603       // We know that the # elements of the results is the same as the
12604       // # elements of the compare (and the # elements of the compare result
12605       // for that matter). Check to see that they are the same size. If so,
12606       // we know that the element size of the sext'd result matches the
12607       // element size of the compare operands.
12608       if (VT.getSizeInBits() == N00VT.getSizeInBits())
12609         return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
12611                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
12613       // If the desired elements are smaller or larger than the source
12614       // elements we can use a matching integer vector type and then
12615       // truncate/any extend
12616       EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
12618           DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
12620                        cast<CondCodeSDNode>(N0.getOperand(2))->get());
12621       return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
12624     // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
12626     if (SDValue SCC = SimplifySelectCC(
12627             DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
12628             DAG.getConstant(0, DL, VT),
12629             cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
12633   if (SDValue NewCtPop = widenCtPop(N, DAG))
12636   if (SDValue Res = tryToFoldExtendSelectLoad(N, TLI, DAG))
// Combine visitor for ISD::AssertSext / ISD::AssertZext nodes. Collapses
// redundant asserts, merges assert/truncate/assert sandwiches into a single
// stronger assert, and hoists an AssertZext above a truncate of an
// AssertSext when the zext assertion is the narrower (stronger) one.
12642 SDValue DAGCombiner::visitAssertExt(SDNode *N) {
12643   unsigned Opcode = N->getOpcode();
12644   SDValue N0 = N->getOperand(0);
12645   SDValue N1 = N->getOperand(1);
12646   EVT AssertVT = cast<VTSDNode>(N1)->getVT();
12648   // fold (assert?ext (assert?ext x, vt), vt) -> (assert?ext x, vt)
12649   if (N0.getOpcode() == Opcode &&
12650       AssertVT == cast<VTSDNode>(N0.getOperand(1))->getVT())
12653   if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
12654       N0.getOperand(0).getOpcode() == Opcode) {
12655     // We have an assert, truncate, assert sandwich. Make one stronger assert
12656     // by asserting on the smallest asserted type to the larger source type.
12657     // This eliminates the later assert:
12658     // assert (trunc (assert X, i8) to iN), i1 --> trunc (assert X, i1) to iN
12659     // assert (trunc (assert X, i1) to iN), i8 --> trunc (assert X, i1) to iN
12661     SDValue BigA = N0.getOperand(0);
12662     EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
        // The narrower asserted type subsumes the wider one.
12663     EVT MinAssertVT = AssertVT.bitsLT(BigA_AssertVT) ? AssertVT : BigA_AssertVT;
12664     SDValue MinAssertVTVal = DAG.getValueType(MinAssertVT);
12665     SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
12666                                     BigA.getOperand(0), MinAssertVTVal);
12667     return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
12670   // If we have (AssertZext (truncate (AssertSext X, iX)), iY) and Y is smaller
12671   // than X. Just move the AssertZext in front of the truncate and drop the
12673   if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse() &&
12674       N0.getOperand(0).getOpcode() == ISD::AssertSext &&
12675       Opcode == ISD::AssertZext) {
12676     SDValue BigA = N0.getOperand(0);
12677     EVT BigA_AssertVT = cast<VTSDNode>(BigA.getOperand(1))->getVT();
        // Only valid when the zext assertion is strictly narrower than the
        // sext assertion it replaces.
12678     if (AssertVT.bitsLT(BigA_AssertVT)) {
12680       SDValue NewAssert = DAG.getNode(Opcode, DL, BigA.getValueType(),
12681                                       BigA.getOperand(0), N1);
12682       return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), NewAssert);
// Combine visitor for ISD::AssertAlign. Merges stacked alignment asserts
// (keeping the stronger one) and pushes the assert down onto the operands
// of trivial arithmetic so those ops become visible to other combines.
12689 SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
12692   Align AL = cast<AssertAlignSDNode>(N)->getAlign();
12693   SDValue N0 = N->getOperand(0);
12695   // Fold (assertalign (assertalign x, AL0), AL1) ->
12696   // (assertalign x, max(AL0, AL1))
12697   if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
12698     return DAG.getAssertAlign(DL, N0.getOperand(0),
12699                               std::max(AL, AAN->getAlign()));
12701   // In rare cases, there are trivial arithmetic ops in source operands. Sink
12702   // this assert down to source operands so that those arithmetic ops could be
12703   // exposed to the DAG combining.
12704   switch (N0.getOpcode()) {
      // (case labels for the handled opcodes are elided in this extraction —
      // presumably ADD/SUB-style pointer arithmetic; confirm against the
      // full source.)
12709     unsigned AlignShift = Log2(AL);
12710     SDValue LHS = N0.getOperand(0);
12711     SDValue RHS = N0.getOperand(1);
12712     unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
12713     unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
      // Sinking is only sound when at least one operand already provides the
      // asserted alignment; tag the other operand with the assert if needed.
12714     if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
12715       if (LHSAlignShift < AlignShift)
12716         LHS = DAG.getAssertAlign(DL, LHS, AL);
12717       if (RHSAlignShift < AlignShift)
12718         RHS = DAG.getAssertAlign(DL, RHS, AL);
12719       return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
12728 /// If the result of a load is shifted/masked/truncated to an effectively
12729 /// narrower type, try to transform the load to a narrower type and/or
12730 /// use an extending load.
12731 SDValue DAGCombiner::reduceLoadWidth(SDNode *N) {
12732 unsigned Opc = N->getOpcode();
12734 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
12735 SDValue N0 = N->getOperand(0);
12736 EVT VT = N->getValueType(0);
12739 // This transformation isn't valid for vector loads.
12743 // The ShAmt variable is used to indicate that we've consumed a right
12744 // shift. I.e. we want to narrow the width of the load by skipping to load the
12745 // ShAmt least significant bits.
12746 unsigned ShAmt = 0;
12747 // A special case is when the least significant bits from the load are masked
12748 // away, but using an AND rather than a right shift. HasShiftedOffset is used
12749 // to indicate that the narrowed load should be left-shifted ShAmt bits to get
12751 bool HasShiftedOffset = false;
12752 // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
12754 if (Opc == ISD::SIGN_EXTEND_INREG) {
12755 ExtType = ISD::SEXTLOAD;
12756 ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
12757 } else if (Opc == ISD::SRL || Opc == ISD::SRA) {
12758 // Another special-case: SRL/SRA is basically zero/sign-extending a narrower
12759 // value, or it may be shifting a higher subword, half or byte into the
12762 // Only handle shift with constant shift amount, and the shiftee must be a
12764 auto *LN = dyn_cast<LoadSDNode>(N0);
12765 auto *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
12768 // If the shift amount is larger than the memory type then we're not
12769 // accessing any of the loaded bytes.
12770 ShAmt = N1C->getZExtValue();
12771 uint64_t MemoryWidth = LN->getMemoryVT().getScalarSizeInBits();
12772 if (MemoryWidth <= ShAmt)
12774 // Attempt to fold away the SRL by using ZEXTLOAD and SRA by using SEXTLOAD.
12775 ExtType = Opc == ISD::SRL ? ISD::ZEXTLOAD : ISD::SEXTLOAD;
12776 ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
12777 // If original load is a SEXTLOAD then we can't simply replace it by a
12778 // ZEXTLOAD (we could potentially replace it by a more narrow SEXTLOAD
12779 // followed by a ZEXT, but that is not handled at the moment). Similarly if
12780 // the original load is a ZEXTLOAD and we want to use a SEXTLOAD.
12781 if ((LN->getExtensionType() == ISD::SEXTLOAD ||
12782 LN->getExtensionType() == ISD::ZEXTLOAD) &&
12783 LN->getExtensionType() != ExtType)
12785 } else if (Opc == ISD::AND) {
12786 // An AND with a constant mask is the same as a truncate + zero-extend.
12787 auto AndC = dyn_cast<ConstantSDNode>(N->getOperand(1));
12791 const APInt &Mask = AndC->getAPIntValue();
12792 unsigned ActiveBits = 0;
12793 if (Mask.isMask()) {
12794 ActiveBits = Mask.countTrailingOnes();
12795 } else if (Mask.isShiftedMask(ShAmt, ActiveBits)) {
12796 HasShiftedOffset = true;
12801 ExtType = ISD::ZEXTLOAD;
12802 ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
12805 // In case Opc==SRL we've already prepared ExtVT/ExtType/ShAmt based on doing
12806 // a right shift. Here we redo some of those checks, to possibly adjust the
12807 // ExtVT even further based on "a masking AND". We could also end up here for
12808 // other reasons (e.g. based on Opc==TRUNCATE) and that is why some checks
12809 // need to be done here as well.
12810 if (Opc == ISD::SRL || N0.getOpcode() == ISD::SRL) {
12811 SDValue SRL = Opc == ISD::SRL ? SDValue(N, 0) : N0;
12812 // Bail out when the SRL has more than one use. This is done for historical
12813 // (undocumented) reasons. Maybe intent was to guard the AND-masking below
12814 // check below? And maybe it could be non-profitable to do the transform in
12815 // case the SRL has multiple uses and we get here with Opc!=ISD::SRL?
12816 // FIXME: Can't we just skip this check for the Opc==ISD::SRL case.
12817 if (!SRL.hasOneUse())
12820 // Only handle shift with constant shift amount, and the shiftee must be a
12822 auto *LN = dyn_cast<LoadSDNode>(SRL.getOperand(0));
12823 auto *SRL1C = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
12827 // If the shift amount is larger than the input type then we're not
12828 // accessing any of the loaded bytes. If the load was a zextload/extload
12829 // then the result of the shift+trunc is zero/undef (handled elsewhere).
12830 ShAmt = SRL1C->getZExtValue();
12831 uint64_t MemoryWidth = LN->getMemoryVT().getSizeInBits();
12832 if (ShAmt >= MemoryWidth)
12835 // Because a SRL must be assumed to *need* to zero-extend the high bits
12836 // (as opposed to anyext the high bits), we can't combine the zextload
12837 // lowering of SRL and an sextload.
12838 if (LN->getExtensionType() == ISD::SEXTLOAD)
12841 // Avoid reading outside the memory accessed by the original load (could
12842 // happened if we only adjust the load base pointer by ShAmt). Instead we
12843 // try to narrow the load even further. The typical scenario here is:
12844 // (i64 (truncate (i96 (srl (load x), 64)))) ->
12845 // (i64 (truncate (i96 (zextload (load i32 + offset) from i32))))
12846 if (ExtVT.getScalarSizeInBits() > MemoryWidth - ShAmt) {
12847 // Don't replace sextload by zextload.
12848 if (ExtType == ISD::SEXTLOAD)
12850 // Narrow the load.
12851 ExtType = ISD::ZEXTLOAD;
12852 ExtVT = EVT::getIntegerVT(*DAG.getContext(), MemoryWidth - ShAmt);
12855 // If the SRL is only used by a masking AND, we may be able to adjust
12856 // the ExtVT to make the AND redundant.
12857 SDNode *Mask = *(SRL->use_begin());
12858 if (SRL.hasOneUse() && Mask->getOpcode() == ISD::AND &&
12859 isa<ConstantSDNode>(Mask->getOperand(1))) {
12860 const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
12861 if (ShiftMask.isMask()) {
12862 EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
12863 ShiftMask.countTrailingOnes());
12864 // If the mask is smaller, recompute the type.
12865 if ((ExtVT.getScalarSizeInBits() > MaskedVT.getScalarSizeInBits()) &&
12866 TLI.isLoadExtLegal(ExtType, SRL.getValueType(), MaskedVT))
12871 N0 = SRL.getOperand(0);
12874 // If the load is shifted left (and the result isn't shifted back right), we
12875 // can fold a truncate through the shift. The typical scenario is that N
12876 // points at a TRUNCATE here so the attempted fold is:
12877 // (truncate (shl (load x), c))) -> (shl (narrow load x), c)
12878 // ShLeftAmt will indicate how much a narrowed load should be shifted left.
12879 unsigned ShLeftAmt = 0;
12880 if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
12881 ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
12882 if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
12883 ShLeftAmt = N01->getZExtValue();
12884 N0 = N0.getOperand(0);
12888 // If we haven't found a load, we can't narrow it.
12889 if (!isa<LoadSDNode>(N0))
12892 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
12893 // Reducing the width of a volatile load is illegal. For atomics, we may be
12894 // able to reduce the width provided we never widen again. (see D66309)
12895 if (!LN0->isSimple() ||
12896 !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
12899 auto AdjustBigEndianShift = [&](unsigned ShAmt) {
12900 unsigned LVTStoreBits =
12901 LN0->getMemoryVT().getStoreSizeInBits().getFixedSize();
12902 unsigned EVTStoreBits = ExtVT.getStoreSizeInBits().getFixedSize();
12903 return LVTStoreBits - EVTStoreBits - ShAmt;
12906 // We need to adjust the pointer to the load by ShAmt bits in order to load
12907 // the correct bytes.
12908 unsigned PtrAdjustmentInBits =
12909 DAG.getDataLayout().isBigEndian() ? AdjustBigEndianShift(ShAmt) : ShAmt;
12911 uint64_t PtrOff = PtrAdjustmentInBits / 8;
12912 Align NewAlign = commonAlignment(LN0->getAlign(), PtrOff);
12914 // The original load itself didn't wrap, so an offset within it doesn't.
12916 Flags.setNoUnsignedWrap(true);
12917 SDValue NewPtr = DAG.getMemBasePlusOffset(LN0->getBasePtr(),
12918 TypeSize::Fixed(PtrOff), DL, Flags);
12919 AddToWorklist(NewPtr.getNode());
12922 if (ExtType == ISD::NON_EXTLOAD)
12923 Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
12924 LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
12925 LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
12927 Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
12928 LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
12929 NewAlign, LN0->getMemOperand()->getFlags(),
12932 // Replace the old load's chain with the new load's chain.
12933 WorklistRemover DeadNodes(*this);
12934 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
12936 // Shift the result left, if we've swallowed a left shift.
12937 SDValue Result = Load;
12938 if (ShLeftAmt != 0) {
12939 EVT ShImmTy = getShiftAmountTy(Result.getValueType());
12940 if (!isUIntN(ShImmTy.getScalarSizeInBits(), ShLeftAmt))
12942 // If the shift amount is as large as the result size (but, presumably,
12943 // no larger than the source) then the useful bits of the result are
12944 // zero; we can't simply return the shortened shift, because the result
12945 // of that operation is undefined.
12946 if (ShLeftAmt >= VT.getScalarSizeInBits())
12947 Result = DAG.getConstant(0, DL, VT);
12949 Result = DAG.getNode(ISD::SHL, DL, VT,
12950 Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
12953 if (HasShiftedOffset) {
// We're using a shifted mask, so the load now has an offset. This means
// that data has been loaded into lower bytes than it otherwise would have
// been, so we need to shl the loaded data into the correct position in the
12958 SDValue ShiftC = DAG.getConstant(ShAmt, DL, VT);
12959 Result = DAG.getNode(ISD::SHL, DL, VT, Result, ShiftC);
12960 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
12963 // Return the new loaded value.
/// Combine a SIGN_EXTEND_INREG node. Operand 0 is the value; operand 1 is a
/// VTSDNode naming the narrow type (ExtVT) whose sign bit is replicated into
/// all bits above it. The result type VT is the same as the input's type.
SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  EVT VT = N->getValueType(0);
  // ExtVT: the implicit "from" type being sign-extended.
  EVT ExtVT = cast<VTSDNode>(N1)->getVT();
  unsigned VTBits = VT.getScalarSizeInBits();
  unsigned ExtVTBits = ExtVT.getScalarSizeInBits();

  // sext_vector_inreg(undef) = 0 because the top bit will all be the same.
    return DAG.getConstant(0, SDLoc(N), VT);

  // fold (sext_in_reg c1) -> c1
  // Re-emitting the node with constant input lets getNode constant-fold it.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);

  // If the input is already sign extended, just drop the extension.
  if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0))

  // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
  if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
      ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
    return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),

  // fold (sext_in_reg (sext x)) -> (sext x)
  // fold (sext_in_reg (aext x)) -> (sext x)
  // if x is small enough or if we know that x has more than 1 sign bit and the
  // sign_extend_inreg is extending from one of them.
  if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    unsigned N00Bits = N00.getScalarValueSizeInBits();
    if ((N00Bits <= ExtVTBits ||
         DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) &&
        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);

  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x)
  // if x is small enough or if we know that x has more than 1 sign bit and the
  // sign_extend_inreg is extending from one of them.
  if (N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
      N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
      N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) {
    SDValue N00 = N0.getOperand(0);
    unsigned N00Bits = N00.getScalarValueSizeInBits();
    unsigned DstElts = N0.getValueType().getVectorMinNumElements();
    unsigned SrcElts = N00.getValueType().getVectorMinNumElements();
    bool IsZext = N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
    // Only the low DstElts elements of the source survive the *_vector_inreg.
    APInt DemandedSrcElts = APInt::getLowBitsSet(SrcElts, DstElts);
    // For a zext source we may only match the exact width; a narrower source
    // would have known-zero (not sign) bits above it.
    if ((N00Bits == ExtVTBits ||
         (!IsZext && (N00Bits < ExtVTBits ||
                      DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) &&
        (!LegalOperations ||
         TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT)))
      return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00);

  // fold (sext_in_reg (zext x)) -> (sext x)
  // iff we are extending the source sign bit.
  if (N0.getOpcode() == ISD::ZERO_EXTEND) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getScalarValueSizeInBits() == ExtVTBits &&
        (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
      // NOTE(review): ISD::SIGN_EXTEND is a unary opcode, yet N1 (the
      // VTSDNode) is passed as a second operand here, creating a
      // two-operand SIGN_EXTEND node. Looks unintentional — confirm against
      // upstream, which passes only N00.
      return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);

  // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
    return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);

  // fold operands of sext_in_reg based on knowledge that the top bits are not
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (sext_in_reg (load x)) -> (smaller sextload x)
  // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
  if (SDValue NarrowLoad = reduceLoadWidth(N))

  // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
  // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
  // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
  if (N0.getOpcode() == ISD::SRL) {
    if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
      if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
        // We can turn this into an SRA iff the input to the SRL is already sign
        // extended enough.
        unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
        if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
          return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),

  // fold (sext_inreg (extload x)) -> (sextload x)
  // If sextload is not supported by target, we can only do the combine when
  // load has one use. Doing otherwise can block folding the extload with other
  // extends that the target does support.
  if (ISD::isEXTLoad(N0.getNode()) &&
      ISD::isUNINDEXEDLoad(N0.getNode()) &&
      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getBasePtr(), ExtVT,
                                     LN0->getMemOperand());
    // Replace both the sext_in_reg and the original load so the old chain
    // users are rewired to the new sextload.
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    AddToWorklist(ExtLoad.getNode());
    return SDValue(N, 0); // Return N so it doesn't get rechecked!

  // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
  if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
      ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
      ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
      TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
    SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                     LN0->getBasePtr(), ExtVT,
                                     LN0->getMemOperand());
    CombineTo(N, ExtLoad);
    CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
    return SDValue(N, 0); // Return N so it doesn't get rechecked!

  // fold (sext_inreg (masked_load x)) -> (sext_masked_load x)
  // ignore it if the masked load is already sign extended
  if (MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0)) {
    if (ExtVT == Ld->getMemoryVT() && N0.hasOneUse() &&
        Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD &&
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) {
      SDValue ExtMaskedLoad = DAG.getMaskedLoad(
          VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(),
          Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(),
          Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad());
      CombineTo(N, ExtMaskedLoad);
      CombineTo(N0.getNode(), ExtMaskedLoad, ExtMaskedLoad.getValue(1));
      return SDValue(N, 0); // Return N so it doesn't get rechecked!

  // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
    if (SDValue(GN0, 0).hasOneUse() &&
        ExtVT == GN0->getMemoryVT() &&
        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
      SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
                       GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};

      SDValue ExtLoad = DAG.getMaskedGather(
          DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
          GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);

      CombineTo(N, ExtLoad);
      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
      AddToWorklist(ExtLoad.getNode());
      return SDValue(N, 0); // Return N so it doesn't get rechecked!

  // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
  if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
    if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
                                           N0.getOperand(1), false))
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
/// Common combines shared by ANY/SIGN/ZERO_EXTEND_VECTOR_INREG nodes:
/// fold undef input to zero, constant-fold the extend, and prune with
/// demanded-elements analysis.
SDValue DAGCombiner::visitEXTEND_VECTOR_INREG(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  // {s/z}ext_vector_inreg(undef) = 0 because the top bits must be the same.
    return DAG.getConstant(0, SDLoc(N), VT);

  // Constant-fold an extend of a constant/buildvector input.
  if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))

  // Simplify based on which result elements are actually demanded.
  if (SimplifyDemandedVectorElts(SDValue(N, 0)))
    return SDValue(N, 0);
/// Combine a TRUNCATE node: cancel redundant extends, narrow loads, and push
/// the truncate toward operands where the narrower operation is cheaper.
SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SrcVT = N0.getValueType();
  // Endianness matters below when picking which sub-element survives a
  // truncating bitcast/extract.
  bool isLE = DAG.getDataLayout().isLittleEndian();

  // fold (truncate (truncate x)) -> (truncate x)
  if (N0.getOpcode() == ISD::TRUNCATE)
    return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));

  // fold (truncate c1) -> c1
  // getNode constant-folds; only use the result if it produced a new node.
  if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
    SDValue C = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
    if (C.getNode() != N)

  // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
  if (N0.getOpcode() == ISD::ZERO_EXTEND ||
      N0.getOpcode() == ISD::SIGN_EXTEND ||
      N0.getOpcode() == ISD::ANY_EXTEND) {
    // if the source is smaller than the dest, we still need an extend.
    if (N0.getOperand(0).getValueType().bitsLT(VT))
      return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
    // if the source is larger than the dest, then we just need the truncate.
    if (N0.getOperand(0).getValueType().bitsGT(VT))
      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
    // if the source and dest are the same type, we can drop both the extend
    // and the truncate.
    return N0.getOperand(0);

  // Try to narrow a truncate-of-sext_in_reg to the destination type:
  // trunc (sign_ext_inreg X, iM) to iN --> sign_ext_inreg (trunc X to iN), iM
  if (!LegalTypes && N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
    SDValue X = N0.getOperand(0);
    SDValue ExtVal = N0.getOperand(1);
    EVT ExtVT = cast<VTSDNode>(ExtVal)->getVT();
    // Only valid while the inreg type still fits inside the truncated type.
    if (ExtVT.bitsLT(VT)) {
      SDValue TrX = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, TrX, ExtVal);

  // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))

  // Fold extract-and-trunc into a narrow extract. For example:
  // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
  // i32 y = TRUNCATE(i64 x)
  // v16i8 b = BITCAST (v2i64 val)
  // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
  //
  // Note: We only run this optimization after type legalization (which often
  // creates this pattern) and before operation legalization after which
  // we need to be more careful about the vector instructions that we generate.
  if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
    EVT VecTy = N0.getOperand(0).getValueType();
    EVT ExTy = N0.getValueType();
    EVT TrTy = N->getValueType(0);

    // Re-view the source vector as a vector of TrTy with SizeRatio times the
    // elements; total bit width is unchanged (asserted below).
    auto EltCnt = VecTy.getVectorElementCount();
    unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
    auto NewEltCnt = EltCnt * SizeRatio;

    EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt);
    assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");

    SDValue EltNo = N0->getOperand(1);
    if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
      int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
      // On big-endian targets the truncated (low) part is the last sub-element
      // of the group, not the first.
      int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
                         DAG.getBitcast(NVT, N0.getOperand(0)),
                         DAG.getVectorIdxConstant(Index, DL));

  // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
  if (N0.getOpcode() == ISD::SELECT && N0.hasOneUse()) {
    if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
        TLI.isTruncateFree(SrcVT, VT)) {
      SDValue Cond = N0.getOperand(0);
      SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
      SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
      return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);

  // trunc (shl x, K) -> shl (trunc x), K => K < VT.getScalarSizeInBits()
  if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
      (!LegalOperations || TLI.isOperationLegal(ISD::SHL, VT)) &&
      TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
    SDValue Amt = N0.getOperand(1);
    KnownBits Known = DAG.computeKnownBits(Amt);
    unsigned Size = VT.getScalarSizeInBits();
    // Safe only if the shift amount provably fits in the narrow type's range.
    if (Known.countMaxActiveBits() <= Log2_32(Size)) {
      EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
      if (AmtVT != Amt.getValueType()) {
        Amt = DAG.getZExtOrTrunc(Amt, SL, AmtVT);
        AddToWorklist(Amt.getNode());
      return DAG.getNode(ISD::SHL, SL, VT, Trunc, Amt);

  // Try matching a usubsat pattern feeding this truncate.
  if (SDValue V = foldSubToUSubSat(VT, N0.getNode()))

  // Attempt to pre-truncate BUILD_VECTOR sources.
  if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
      TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
      // Avoid creating illegal types if running after type legalizer.
      (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
    EVT SVT = VT.getScalarType();
    SmallVector<SDValue, 8> TruncOps;
    for (const SDValue &Op : N0->op_values()) {
      SDValue TruncOp = DAG.getNode(ISD::TRUNCATE, DL, SVT, Op);
      TruncOps.push_back(TruncOp);
    return DAG.getBuildVector(VT, DL, TruncOps);

  // Fold a series of buildvector, bitcast, and truncate if possible.
  // For example fold
  // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
  // (2xi32 (buildvector x, y)).
  if (Level == AfterLegalizeVectorOps && VT.isVector() &&
      N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
      N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
      N0.getOperand(0).hasOneUse()) {
    SDValue BuildVect = N0.getOperand(0);
    EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
    EVT TruncVecEltTy = VT.getVectorElementType();

    // Check that the element types match.
    if (BuildVectEltTy == TruncVecEltTy) {
      // Now we only need to compute the offset of the truncated elements.
      unsigned BuildVecNumElts = BuildVect.getNumOperands();
      unsigned TruncVecNumElts = VT.getVectorNumElements();
      unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;

      assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
             "Invalid number of elements");

      // Keep every TruncEltOffset-th operand of the original buildvector.
      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
        Opnds.push_back(BuildVect.getOperand(i));

      return DAG.getBuildVector(VT, SDLoc(N), Opnds);

  // fold (truncate (load x)) -> (smaller load x)
  // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
  if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
    if (SDValue Reduced = reduceLoadWidth(N))

    // Handle the case where the load remains an extending load even
    // after truncation.
    if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
      if (LN0->isSimple() && LN0->getMemoryVT().bitsLT(VT)) {
        SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
                                         VT, LN0->getChain(), LN0->getBasePtr(),
                                         LN0->getMemoryVT(),
                                         LN0->getMemOperand());
        // Rewire chain users of the old load to the replacement.
        DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));

  // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
  // where ... are all 'undef'.
  if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
    SmallVector<EVT, 8> VTs;
    unsigned NumDefs = 0;

    for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
      SDValue X = N0.getOperand(i);
      if (!X.isUndef()) {
      // Stop if more than one members are non-undef.
      // Record the narrowed type of each concat operand.
      VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
                                     VT.getVectorElementType(),
                                     X.getValueType().getVectorElementCount()));

      // All operands undef: the whole truncate is undef.
      return DAG.getUNDEF(VT);

    if (NumDefs == 1) {
      assert(V.getNode() && "The single defined operand is empty!");
      SmallVector<SDValue, 8> Opnds;
      for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
          Opnds.push_back(DAG.getUNDEF(VTs[i]));
        SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
        AddToWorklist(NV.getNode());
        Opnds.push_back(NV);
      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);

  // Fold truncate of a bitcast of a vector to an extract of the low vector
  // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, idx
  if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue VecSrc = N0.getOperand(0);
    EVT VecSrcVT = VecSrc.getValueType();
    if (VecSrcVT.isVector() && VecSrcVT.getScalarType() == VT &&
        (!LegalOperations ||
         TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
      // The low part is element 0 on little-endian, the last element on
      // big-endian.
      unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
                         DAG.getVectorIdxConstant(Idx, SL));

  // Simplify the operands using demanded-bits information.
  if (SimplifyDemandedBits(SDValue(N, 0)))
    return SDValue(N, 0);

  // fold (truncate (extract_subvector(ext x))) ->
  // (extract_subvector x)
  // TODO: This can be generalized to cover cases where the truncate and extract
  // do not fully cancel each other out.
  if (!LegalTypes && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == ISD::SIGN_EXTEND ||
        N00.getOpcode() == ISD::ZERO_EXTEND ||
        N00.getOpcode() == ISD::ANY_EXTEND) {
      if (N00.getOperand(0)->getValueType(0).getVectorElementType() ==
          VT.getVectorElementType())
        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N0->getOperand(0)), VT,
                           N00.getOperand(0), N0.getOperand(1));

  if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))

  // Narrow a suitable binary operation with a non-opaque constant operand by
  // moving it ahead of the truncate. This is limited to pre-legalization
  // because targets may prefer a wider type during later combines and invert
  switch (N0.getOpcode()) {
    if (!LegalOperations && N0.hasOneUse() &&
        (isConstantOrConstantVector(N0.getOperand(0), true) ||
         isConstantOrConstantVector(N0.getOperand(1), true))) {
      // TODO: We already restricted this to pre-legalization, but for vectors
      // we are extra cautious to not create an unsupported operation.
      // Target-specific changes are likely needed to avoid regressions here.
      if (VT.isScalarInteger() || TLI.isOperationLegal(N0.getOpcode(), VT)) {
        SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
        SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
        return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
  case ISD::ADDCARRY:
    // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
    // (trunc addcarry(X, Y, Carry)) -> (addcarry trunc(X), trunc(Y), Carry)
    // When the adde's carry is not used.
    // We only do this for ADDCARRY before operation legalization.
    if (((!LegalOperations && N0.getOpcode() == ISD::ADDCARRY) ||
         TLI.isOperationLegal(N0.getOpcode(), VT)) &&
        N0.hasOneUse() && !N0->hasAnyUseOfValue(1)) {
      SDValue X = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
      SDValue Y = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
      SDVTList VTs = DAG.getVTList(VT, N0->getValueType(1));
      return DAG.getNode(N0.getOpcode(), DL, VTs, X, Y, N0.getOperand(2));
    // Truncate the USUBSAT only if LHS is a known zero-extension; it's not
    // enough to know that the upper bits are zero, we must ensure that we don't
    // introduce an extra truncate.
    if (!LegalOperations && N0.hasOneUse() &&
        N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
        N0.getOperand(0).getOperand(0).getScalarValueSizeInBits() <=
            VT.getScalarSizeInBits() &&
        hasOperation(N0.getOpcode(), VT)) {
      return getTruncatedUSUBSAT(VT, SrcVT, N0.getOperand(0), N0.getOperand(1),
13495 static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
13496 SDValue Elt = N->getOperand(i);
13497 if (Elt.getOpcode() != ISD::MERGE_VALUES)
13498 return Elt.getNode();
13499 return Elt.getOperand(Elt.getResNo()).getNode();
/// build_pair (load, load) -> load
/// if load locations are consecutive.
SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
  assert(N->getOpcode() == ISD::BUILD_PAIR);

  auto *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
  auto *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));

  // A BUILD_PAIR always has the least significant part in elt 0 and the
  // most significant part in elt 1. So when combining into one large load, we
  // need to consider the endianness.
  if (DAG.getDataLayout().isBigEndian())
    std::swap(LD1, LD2);

  // Require two simple, single-use, non-extending loads in the same address
  // space; otherwise the merge is unsafe or unprofitable.
  if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !ISD::isNON_EXTLoad(LD2) ||
      !LD1->hasOneUse() || !LD2->hasOneUse() ||
      LD1->getAddressSpace() != LD2->getAddressSpace())

  bool LD1Fast = false;
  EVT LD1VT = LD1->getValueType(0);
  unsigned LD1Bytes = LD1VT.getStoreSize();
  // Merge only if a wide load is legal (or pre-legalization), the two loads
  // are provably consecutive, and the target says the wide access is fast.
  if ((!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
      DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1) &&
      TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             *LD1->getMemOperand(), &LD1Fast) && LD1Fast)
    return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
                       LD1->getPointerInfo(), LD1->getAlign());
13534 static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
13535 // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
13536 // and Lo parts; on big-endian machines it doesn't.
13537 return DAG.getDataLayout().isBigEndian() ? 1 : 0;
/// Try to turn (bitcast (logic-op (bitcast fp X), SignMaskConst)) back into
/// the equivalent FP operation (fabs/fneg) when the target preserves FP bits
/// through integer logic.
static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
                                    const TargetLowering &TLI) {
  // If this is not a bitcast to an FP type or if the target doesn't have
  // IEEE754-compliant FP logic, we're done.
  EVT VT = N->getValueType(0);
  if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))

  // TODO: Handle cases where the integer constant is a different scalar
  // bitwidth to the FP.
  SDValue N0 = N->getOperand(0);
  EVT SourceVT = N0.getValueType();
  if (VT.getScalarSizeInBits() != SourceVT.getScalarSizeInBits())

  // Per logic opcode, pick the FP opcode and the exact constant mask the
  // logic operand must equal for the fold to apply.
  switch (N0.getOpcode()) {
    // AND with ~signbit clears the sign -> fabs.
    FPOpcode = ISD::FABS;
    SignMask = ~APInt::getSignMask(SourceVT.getScalarSizeInBits());
    // XOR with signbit flips the sign -> fneg.
    FPOpcode = ISD::FNEG;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());
    // OR with signbit forces the sign on -> fneg(fabs) (see OR special case
    // below).
    FPOpcode = ISD::FABS;
    SignMask = APInt::getSignMask(SourceVT.getScalarSizeInBits());

  // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
  // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
  // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
  SDValue LogicOp0 = N0.getOperand(0);
  ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
  if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
      LogicOp0.getOpcode() == ISD::BITCAST &&
      LogicOp0.getOperand(0).getValueType() == VT) {
    SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
    NumFPLogicOpsConv++;
    // OR sets the sign bit unconditionally: that is fneg of fabs.
    if (N0.getOpcode() == ISD::OR)
      return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
13593 SDValue DAGCombiner::visitBITCAST(SDNode *N) {
13594 SDValue N0 = N->getOperand(0);
13595 EVT VT = N->getValueType(0);
13598 return DAG.getUNDEF(VT);
13600 // If the input is a BUILD_VECTOR with all constant elements, fold this now.
13601 // Only do this before legalize types, unless both types are integer and the
13602 // scalar type is legal. Only do this before legalize ops, since the target
13603 // maybe depending on the bitcast.
13604 // First check to see if this is all constant.
13605 // TODO: Support FP bitcasts after legalize types.
13606 if (VT.isVector() &&
13608 (!LegalOperations && VT.isInteger() && N0.getValueType().isInteger() &&
13609 TLI.isTypeLegal(VT.getVectorElementType()))) &&
13610 N0.getOpcode() == ISD::BUILD_VECTOR && N0->hasOneUse() &&
13611 cast<BuildVectorSDNode>(N0)->isConstant())
13612 return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(),
13613 VT.getVectorElementType());
13615 // If the input is a constant, let getNode fold it.
13616 if (isIntOrFPConstant(N0)) {
13617 // If we can't allow illegal operations, we need to check that this is just
13618 // a fp -> int or int -> conversion and that the resulting operation will
13620 if (!LegalOperations ||
13621 (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
13622 TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
13623 (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
13624 TLI.isOperationLegal(ISD::Constant, VT))) {
13625 SDValue C = DAG.getBitcast(VT, N0);
13626 if (C.getNode() != N)
13631 // (conv (conv x, t1), t2) -> (conv x, t2)
13632 if (N0.getOpcode() == ISD::BITCAST)
13633 return DAG.getBitcast(VT, N0.getOperand(0));
13635 // fold (conv (load x)) -> (load (conv*)x)
13636 // If the resultant load doesn't need a higher alignment than the original!
13637 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
13638 // Do not remove the cast if the types differ in endian layout.
13639 TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
13640 TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
13641 // If the load is volatile, we only want to change the load type if the
13642 // resulting load is legal. Otherwise we might increase the number of
13643 // memory accesses. We don't care if the original type was legal or not
13644 // as we assume software couldn't rely on the number of accesses of an
13646 ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
13647 TLI.isOperationLegal(ISD::LOAD, VT))) {
13648 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
13650 if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
13651 *LN0->getMemOperand())) {
13653 DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
13654 LN0->getPointerInfo(), LN0->getAlign(),
13655 LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
13656 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
13661 if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
13664 // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
13665 // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
13668 // fold (bitcast (fneg x)) ->
13669 // flipbit = signbit
13670 // (xor (bitcast x) (build_pair flipbit, flipbit))
13672 // fold (bitcast (fabs x)) ->
13673 // flipbit = (and (extract_element (bitcast x), 0), signbit)
13674 // (xor (bitcast x) (build_pair flipbit, flipbit))
13675 // This often reduces constant pool loads.
13676 if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
13677 (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
13678 N0->hasOneUse() && VT.isInteger() && !VT.isVector() &&
13679 !N0.getValueType().isVector()) {
13680 SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
13681 AddToWorklist(NewConv.getNode());
13684 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
13685 assert(VT.getSizeInBits() == 128);
13686 SDValue SignBit = DAG.getConstant(
13687 APInt::getSignMask(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
13689 if (N0.getOpcode() == ISD::FNEG) {
13691 AddToWorklist(FlipBit.getNode());
13693 assert(N0.getOpcode() == ISD::FABS);
13695 DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
13696 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
13698 AddToWorklist(Hi.getNode());
13699 FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
13700 AddToWorklist(FlipBit.getNode());
13703 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
13704 AddToWorklist(FlipBits.getNode());
13705 return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
13707 APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
13708 if (N0.getOpcode() == ISD::FNEG)
13709 return DAG.getNode(ISD::XOR, DL, VT,
13710 NewConv, DAG.getConstant(SignBit, DL, VT));
13711 assert(N0.getOpcode() == ISD::FABS);
13712 return DAG.getNode(ISD::AND, DL, VT,
13713 NewConv, DAG.getConstant(~SignBit, DL, VT));
13716 // fold (bitconvert (fcopysign cst, x)) ->
13717 // (or (and (bitconvert x), sign), (and cst, (not sign)))
13718 // Note that we don't handle (copysign x, cst) because this can always be
13719 // folded to an fneg or fabs.
13722 // fold (bitcast (fcopysign cst, x)) ->
13723 // flipbit = (and (extract_element
13724 // (xor (bitcast cst), (bitcast x)), 0),
13726 // (xor (bitcast cst) (build_pair flipbit, flipbit))
13727 if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse() &&
13728 isa<ConstantFPSDNode>(N0.getOperand(0)) && VT.isInteger() &&
13730 unsigned OrigXWidth = N0.getOperand(1).getValueSizeInBits();
13731 EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
13732 if (isTypeLegal(IntXVT)) {
13733 SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
13734 AddToWorklist(X.getNode());
13736 // If X has a different width than the result/lhs, sext it or truncate it.
13737 unsigned VTWidth = VT.getSizeInBits();
13738 if (OrigXWidth < VTWidth) {
13739 X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
13740 AddToWorklist(X.getNode());
13741 } else if (OrigXWidth > VTWidth) {
13742 // To get the sign bit in the right place, we have to shift it right
13743 // before truncating.
13745 X = DAG.getNode(ISD::SRL, DL,
13746 X.getValueType(), X,
13747 DAG.getConstant(OrigXWidth-VTWidth, DL,
13748 X.getValueType()));
13749 AddToWorklist(X.getNode());
13750 X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
13751 AddToWorklist(X.getNode());
13754 if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
13755 APInt SignBit = APInt::getSignMask(VT.getSizeInBits() / 2);
13756 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
13757 AddToWorklist(Cst.getNode());
13758 SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
13759 AddToWorklist(X.getNode());
13760 SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
13761 AddToWorklist(XorResult.getNode());
13762 SDValue XorResult64 = DAG.getNode(
13763 ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
13764 DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
13765 SDLoc(XorResult)));
13766 AddToWorklist(XorResult64.getNode());
13768 DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
13769 DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
13770 AddToWorklist(FlipBit.getNode());
13772 DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
13773 AddToWorklist(FlipBits.getNode());
13774 return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
13776 APInt SignBit = APInt::getSignMask(VT.getSizeInBits());
13777 X = DAG.getNode(ISD::AND, SDLoc(X), VT,
13778 X, DAG.getConstant(SignBit, SDLoc(X), VT));
13779 AddToWorklist(X.getNode());
13781 SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
13782 Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
13783 Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
13784 AddToWorklist(Cst.getNode());
13786 return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
13790 // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
13791 if (N0.getOpcode() == ISD::BUILD_PAIR)
13792 if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
13795 // Remove double bitcasts from shuffles - this is often a legacy of
13796 // XformToShuffleWithZero being used to combine bitmaskings (of
13797 // float vectors bitcast to integer vectors) into shuffles.
13798 // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
13799 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
13800 N0->getOpcode() == ISD::VECTOR_SHUFFLE && N0.hasOneUse() &&
13801 VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
13802 !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
13803 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
13805 // If operands are a bitcast, peek through if it casts the original VT.
13806 // If operands are a constant, just bitcast back to original VT.
13807 auto PeekThroughBitcast = [&](SDValue Op) {
13808 if (Op.getOpcode() == ISD::BITCAST &&
13809 Op.getOperand(0).getValueType() == VT)
13810 return SDValue(Op.getOperand(0));
13811 if (Op.isUndef() || isAnyConstantBuildVector(Op))
13812 return DAG.getBitcast(VT, Op);
13816 // FIXME: If either input vector is bitcast, try to convert the shuffle to
13817 // the result type of this bitcast. This would eliminate at least one
13818 // bitcast. See the transform in InstCombine.
13819 SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
13820 SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
13825 VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
13826 SmallVector<int, 8> NewMask;
13827 for (int M : SVN->getMask())
13828 for (int i = 0; i != MaskScale; ++i)
13829 NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
13831 SDValue LegalShuffle =
13832 TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
13834 return LegalShuffle;
// NOTE(review): this dump elides source lines (gaps in the embedded line
// numbers, e.g. the closing brace after 13842); code kept verbatim — verify
// against upstream before editing.
// Combine a BUILD_PAIR node: if its two operands are consecutive loads,
// CombineConsecutiveLoads replaces the pair with a single wider load of VT.
13840 SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
13841 EVT VT = N->getValueType(0);
13842 return CombineConsecutiveLoads(N, VT);
// NOTE(review): this dump elides source lines (embedded line-number gaps);
// in particular the body of the first `if` (presumably `return N0;`) and the
// function tail are missing. Code kept verbatim — verify against upstream.
// Simplify an ISD::FREEZE node.
13845 SDValue DAGCombiner::visitFREEZE(SDNode *N) {
13846 SDValue N0 = N->getOperand(0);
// Freeze is a no-op when the operand is already known not to be undef/poison.
13848 if (DAG.isGuaranteedNotToBeUndefOrPoison(N0, /*PoisonOnly*/ false))
13851 // Fold freeze(bitcast(x)) -> bitcast(freeze(x)).
13852 // TODO: Replace with pushFreezeToPreventPoisonFromPropagating fold.
13853 if (N0.getOpcode() == ISD::BITCAST)
13854 return DAG.getBitcast(N->getValueType(0),
13855 DAG.getNode(ISD::FREEZE, SDLoc(N0),
13856 N0.getOperand(0).getValueType(),
13857 N0.getOperand(0)));
// NOTE(review): this dump elides source lines (embedded line-number gaps),
// e.g. the declaration of `DL` used at 13932/13936 and several closing
// braces / bail-out returns. Code kept verbatim — verify against upstream.
13862 /// We know that BV is a build_vector node with Constant, ConstantFP or Undef
13863 /// operands. DstEltVT indicates the destination element value type.
/// Constant-folds a bitcast of a BUILD_VECTOR by re-expressing the same raw
/// bits as a BUILD_VECTOR of DstEltVT elements. Handles three cases below:
/// same-width element conversion, FP source (route through integers), and
/// integer<->integer of differing widths (via getConstantRawBits).
13864 SDValue DAGCombiner::
13865 ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
13866 EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
13868 // If this is already the right type, we're done.
13869 if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
13871 unsigned SrcBitSize = SrcEltVT.getSizeInBits();
13872 unsigned DstBitSize = DstEltVT.getSizeInBits();
13874 // If this is a conversion of N elements of one type to N elements of another
13875 // type, convert each element. This handles FP<->INT cases.
13876 if (SrcBitSize == DstBitSize) {
13877 SmallVector<SDValue, 8> Ops;
13878 for (SDValue Op : BV->op_values()) {
13879 // If the vector element type is not legal, the BUILD_VECTOR operands
13880 // are promoted and implicitly truncated. Make that explicit here.
13881 if (Op.getValueType() != SrcEltVT)
13882 Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
// Each element is bitcast 1:1; element count is unchanged.
13883 Ops.push_back(DAG.getBitcast(DstEltVT, Op));
13884 AddToWorklist(Ops.back().getNode());
13886 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
13887 BV->getValueType(0).getVectorNumElements());
13888 return DAG.getBuildVector(VT, SDLoc(BV), Ops);
13891 // Otherwise, we're growing or shrinking the elements. To avoid having to
13892 // handle annoying details of growing/shrinking FP values, we convert them to
// ... integers first (recursive call below re-enters with an integer SrcEltVT).
13894 if (SrcEltVT.isFloatingPoint()) {
13895 // Convert the input float vector to a int vector where the elements are the
13897 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
13898 BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
13902 // Now we know the input is an integer vector. If the output is a FP type,
13903 // convert to integer first, then to FP of the right size.
13904 if (DstEltVT.isFloatingPoint()) {
13905 EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
13906 SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
13908 // Next, convert to FP elements of the same size.
13909 return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
13912 // Okay, we know the src/dst types are both integers of differing types.
13913 assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
13915 // TODO: Should ConstantFoldBITCASTofBUILD_VECTOR always take a
13916 // BuildVectorSDNode?
13917 auto *BVN = cast<BuildVectorSDNode>(BV);
13919 // Extract the constant raw bit data.
13920 BitVector UndefElements;
13921 SmallVector<APInt> RawBits;
13922 bool IsLE = DAG.getDataLayout().isLittleEndian();
// getConstantRawBits repacks the vector's bits into DstBitSize-wide chunks;
// failure (non-constant operands) falls through — the elided line after this
// condition is presumably `return SDValue();` (TODO confirm upstream).
13923 if (!BVN->getConstantRawBits(IsLE, DstBitSize, RawBits, UndefElements))
13927 SmallVector<SDValue, 8> Ops;
13928 for (unsigned I = 0, E = RawBits.size(); I != E; ++I) {
13929 if (UndefElements[I])
13930 Ops.push_back(DAG.getUNDEF(DstEltVT))
13932 Ops.push_back(DAG.getConstant(RawBits[I], DL, DstEltVT));
13935 EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
13936 return DAG.getBuildVector(VT, DL, Ops);
// NOTE(review): closing brace elided in this dump (line-number gap after
// 13945); code kept verbatim.
13939 // Returns true if floating point contraction is allowed on the FMUL-SDValue
// `N`, i.e. either a global option (-fp-contract=fast / unsafe-fp-math) or
// the node's own `contract` fast-math flag permits fusing it into an FMA.
13941 static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
13942 assert(N.getOpcode() == ISD::FMUL);
13944 return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13945 N->getFlags().hasAllowContract();
// NOTE(review): closing brace elided in this dump; code kept verbatim.
13948 // Returns true if `N` can assume no infinities involved in its computation.
// True when either the global -enable-no-infs-fp-math option is set or the
// node itself carries the `ninf` fast-math flag.
13949 static bool hasNoInfs(const TargetOptions &Options, SDValue N) {
13950 return Options.NoInfsFPMath || N->getFlags().hasNoInfs();
// NOTE(review): this dump elides source lines (embedded line-number gaps):
// the declarations of `SL` and `HasFMA`, early `return SDValue()` exits,
// several closing braces, and trailing operands of some getNode calls are
// missing. Code kept verbatim — verify against upstream before editing.
13953 /// Try to perform FMA combining on a given FADD node.
/// Rewrites (fadd (fmul x, y), z) and related patterns into ISD::FMA/FMAD
/// when target hooks and fast-math/contract flags allow it. Returns the
/// fused node, or (in elided paths) SDValue() when no fold applies.
13954 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
13955 SDValue N0 = N->getOperand(0);
13956 SDValue N1 = N->getOperand(1);
13957 EVT VT = N->getValueType(0);
13960 const TargetOptions &Options = DAG.getTarget().Options;
13962 // Floating-point multiply-add with intermediate rounding.
13963 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
13965 // Floating-point multiply-add without intermediate rounding.
// (the `bool HasFMA =` declaration line is elided from this dump)
13967 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
13968 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
13970 // No valid opcode, do not combine.
13971 if (!HasFMAD && !HasFMA)
13974 bool CanReassociate =
13975 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
13976 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
13977 Options.UnsafeFPMath || HasFMAD);
13978 // If the addition is not contractable, do not combine.
13979 if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
// Defer to the machine combiner when the target prefers doing fusion there.
13982 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
13985 // Always prefer FMAD to FMA for precision.
13986 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
13987 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
// Predicate: is this node already a fused multiply-add?
13989 auto isFusedOp = [&](SDValue N) {
13990 unsigned Opcode = N.getOpcode();
13991 return Opcode == ISD::FMA || Opcode == ISD::FMAD;
13994 // Is the node an FMUL and contractable either due to global flags or
// ... the node's own `contract` flag (shadows the file-level static helper).
13996 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
13997 if (N.getOpcode() != ISD::FMUL)
13999 return AllowFusionGlobally || N->getFlags().hasAllowContract();
14001 // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
14002 // prefer to fold the multiply with fewer uses.
14003 if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
// (the swap of N0/N1 performed inside this branch is elided from this dump)
14004 if (N0->use_size() > N1->use_size())
14008 // fold (fadd (fmul x, y), z) -> (fma x, y, z)
14009 if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
14010 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
14011 N0.getOperand(1), N1);
14014 // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
14015 // Note: Commutes FADD operands.
14016 if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
14017 return DAG.getNode(PreferredFusedOpcode, SL, VT, N1.getOperand(0),
14018 N1.getOperand(1), N0);
14021 // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
14022 // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
14023 // This requires reassociation because it changes the order of operations.
// (the declarations of FMA/E selected by these branches are elided)
14025 if (CanReassociate && isFusedOp(N0) &&
14026 N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
14027 N0.getOperand(2).hasOneUse()) {
14030 } else if (CanReassociate && isFusedOp(N1) &&
14031 N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
14032 N1.getOperand(2).hasOneUse()) {
14037 SDValue A = FMA.getOperand(0);
14038 SDValue B = FMA.getOperand(1);
14039 SDValue C = FMA.getOperand(2).getOperand(0);
14040 SDValue D = FMA.getOperand(2).getOperand(1);
14041 SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E);
14042 return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE);
14045 // Look through FP_EXTEND nodes to do more combining.
14047 // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
14048 if (N0.getOpcode() == ISD::FP_EXTEND) {
14049 SDValue N00 = N0.getOperand(0);
14050 if (isContractableFMUL(N00) &&
14051 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14052 N00.getValueType())) {
14053 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14054 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14055 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14060 // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
14061 // Note: Commutes FADD operands.
14062 if (N1.getOpcode() == ISD::FP_EXTEND) {
14063 SDValue N10 = N1.getOperand(0);
14064 if (isContractableFMUL(N10) &&
14065 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14066 N10.getValueType())) {
14067 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14068 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0)),
14069 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)),
14074 // More folding opportunities when target permits.
// (the enclosing `if (Aggressive)` guard line is presumably elided — TODO confirm)
14076 // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
14077 // -> (fma x, y, (fma (fpext u), (fpext v), z))
14078 auto FoldFAddFMAFPExtFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
14080 return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
14081 DAG.getNode(PreferredFusedOpcode, SL, VT,
14082 DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
14083 DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
14086 if (isFusedOp(N0)) {
14087 SDValue N02 = N0.getOperand(2);
14088 if (N02.getOpcode() == ISD::FP_EXTEND) {
14089 SDValue N020 = N02.getOperand(0);
14090 if (isContractableFMUL(N020) &&
14091 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14092 N020.getValueType())) {
14093 return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
14094 N020.getOperand(0), N020.getOperand(1),
14100 // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
14101 // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
14102 // FIXME: This turns two single-precision and one double-precision
14103 // operation into two double-precision operations, which might not be
14104 // interesting for all targets, especially GPUs.
14105 auto FoldFAddFPExtFMAFMul = [&](SDValue X, SDValue Y, SDValue U, SDValue V,
14107 return DAG.getNode(
14108 PreferredFusedOpcode, SL, VT, DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
14109 DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
14110 DAG.getNode(PreferredFusedOpcode, SL, VT,
14111 DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
14112 DAG.getNode(ISD::FP_EXTEND, SL, VT, V), Z));
14114 if (N0.getOpcode() == ISD::FP_EXTEND) {
14115 SDValue N00 = N0.getOperand(0);
14116 if (isFusedOp(N00)) {
14117 SDValue N002 = N00.getOperand(2);
14118 if (isContractableFMUL(N002) &&
14119 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14120 N00.getValueType())) {
14121 return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
14122 N002.getOperand(0), N002.getOperand(1),
14128 // fold (fadd x, (fma y, z, (fpext (fmul u, v)))
14129 // -> (fma y, z, (fma (fpext u), (fpext v), x))
14130 if (isFusedOp(N1)) {
14131 SDValue N12 = N1.getOperand(2);
14132 if (N12.getOpcode() == ISD::FP_EXTEND) {
14133 SDValue N120 = N12.getOperand(0);
14134 if (isContractableFMUL(N120) &&
14135 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14136 N120.getValueType())) {
14137 return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
14138 N120.getOperand(0), N120.getOperand(1),
14144 // fold (fadd x, (fpext (fma y, z, (fmul u, v)))
14145 // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
14146 // FIXME: This turns two single-precision and one double-precision
14147 // operation into two double-precision operations, which might not be
14148 // interesting for all targets, especially GPUs.
14149 if (N1.getOpcode() == ISD::FP_EXTEND) {
14150 SDValue N10 = N1.getOperand(0);
14151 if (isFusedOp(N10)) {
14152 SDValue N102 = N10.getOperand(2);
14153 if (isContractableFMUL(N102) &&
14154 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14155 N10.getValueType())) {
14156 return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
14157 N102.getOperand(0), N102.getOperand(1),
// NOTE(review): this dump elides source lines (embedded line-number gaps):
// the declarations of `SL` and `HasFMA`, early `return SDValue()` exits,
// lambda closers/fallthrough returns, some getNode operands, and the
// function tail are missing. Code kept verbatim — verify against upstream.
14167 /// Try to perform FMA combining on a given FSUB node.
/// Rewrites (fsub (fmul x, y), z), (fsub x, (fmul y, z)) and fpext/fneg
/// variants into ISD::FMA/FMAD with negated operands, subject to target
/// hooks and fast-math/contract flags.
14168 SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
14169 SDValue N0 = N->getOperand(0);
14170 SDValue N1 = N->getOperand(1);
14171 EVT VT = N->getValueType(0);
14174 const TargetOptions &Options = DAG.getTarget().Options;
14175 // Floating-point multiply-add with intermediate rounding.
14176 bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
14178 // Floating-point multiply-add without intermediate rounding.
// (the `bool HasFMA =` declaration line is elided from this dump)
14180 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
14181 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
14183 // No valid opcode, do not combine.
14184 if (!HasFMAD && !HasFMA)
14187 const SDNodeFlags Flags = N->getFlags();
14188 bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
14189 Options.UnsafeFPMath || HasFMAD);
14191 // If the subtraction is not contractable, do not combine.
14192 if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
// Defer to the machine combiner when the target prefers doing fusion there.
14195 if (TLI.generateFMAsInMachineCombiner(VT, OptLevel))
14198 // Always prefer FMAD to FMA for precision.
14199 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
14200 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
14201 bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
14203 // Is the node an FMUL and contractable either due to global flags or
// ... the node's own `contract` flag (shadows the file-level static helper).
14205 auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
14206 if (N.getOpcode() != ISD::FMUL)
14208 return AllowFusionGlobally || N->getFlags().hasAllowContract();
14211 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
14212 auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
14213 if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
14214 return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
14215 XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z));
14220 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
14221 // Note: Commutes FSUB operands.
14222 auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
14223 if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
14224 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14225 DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
14226 YZ.getOperand(1), X);
14231 // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
14232 // prefer to fold the multiply with fewer uses.
14233 if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
14234 (N0->use_size() > N1->use_size())) {
14235 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
14236 if (SDValue V = tryToFoldXSubYZ(N0, N1))
14238 // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
14239 if (SDValue V = tryToFoldXYSubZ(N0, N1))
14242 // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
14243 if (SDValue V = tryToFoldXYSubZ(N0, N1))
14245 // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
14246 if (SDValue V = tryToFoldXSubYZ(N0, N1))
14250 // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
14251 if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
14252 (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
14253 SDValue N00 = N0.getOperand(0).getOperand(0);
14254 SDValue N01 = N0.getOperand(0).getOperand(1);
14255 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14256 DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
14257 DAG.getNode(ISD::FNEG, SL, VT, N1));
14260 // Look through FP_EXTEND nodes to do more combining.
14262 // fold (fsub (fpext (fmul x, y)), z)
14263 // -> (fma (fpext x), (fpext y), (fneg z))
14264 if (N0.getOpcode() == ISD::FP_EXTEND) {
14265 SDValue N00 = N0.getOperand(0);
14266 if (isContractableFMUL(N00) &&
14267 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14268 N00.getValueType())) {
14269 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14270 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14271 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14272 DAG.getNode(ISD::FNEG, SL, VT, N1));
14276 // fold (fsub x, (fpext (fmul y, z)))
14277 // -> (fma (fneg (fpext y)), (fpext z), x)
14278 // Note: Commutes FSUB operands.
14279 if (N1.getOpcode() == ISD::FP_EXTEND) {
14280 SDValue N10 = N1.getOperand(0);
14281 if (isContractableFMUL(N10) &&
14282 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14283 N10.getValueType())) {
14284 return DAG.getNode(
14285 PreferredFusedOpcode, SL, VT,
14286 DAG.getNode(ISD::FNEG, SL, VT,
14287 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(0))),
14288 DAG.getNode(ISD::FP_EXTEND, SL, VT, N10.getOperand(1)), N0);
14292 // fold (fsub (fpext (fneg (fmul, x, y))), z)
14293 // -> (fneg (fma (fpext x), (fpext y), z))
14294 // Note: This could be removed with appropriate canonicalization of the
14295 // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
14296 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
14297 // from implementing the canonicalization in visitFSUB.
14298 if (N0.getOpcode() == ISD::FP_EXTEND) {
14299 SDValue N00 = N0.getOperand(0);
14300 if (N00.getOpcode() == ISD::FNEG) {
14301 SDValue N000 = N00.getOperand(0);
14302 if (isContractableFMUL(N000) &&
14303 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14304 N00.getValueType())) {
// (the outer FNEG wrapping of this getNode is elided from this dump)
14305 return DAG.getNode(
14307 DAG.getNode(PreferredFusedOpcode, SL, VT,
14308 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
14309 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
14315 // fold (fsub (fneg (fpext (fmul, x, y))), z)
14316 // -> (fneg (fma (fpext x)), (fpext y), z)
14317 // Note: This could be removed with appropriate canonicalization of the
14318 // input expression into (fneg (fadd (fpext (fmul, x, y)), z). However, the
14319 // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
14320 // from implementing the canonicalization in visitFSUB.
14321 if (N0.getOpcode() == ISD::FNEG) {
14322 SDValue N00 = N0.getOperand(0);
14323 if (N00.getOpcode() == ISD::FP_EXTEND) {
14324 SDValue N000 = N00.getOperand(0);
14325 if (isContractableFMUL(N000) &&
14326 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14327 N000.getValueType())) {
14328 return DAG.getNode(
14330 DAG.getNode(PreferredFusedOpcode, SL, VT,
14331 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(0)),
14332 DAG.getNode(ISD::FP_EXTEND, SL, VT, N000.getOperand(1)),
// Reassociation permission: from a global option or the node's `reassoc` flag.
14338 auto isReassociable = [Options](SDNode *N) {
14339 return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
14342 auto isContractableAndReassociableFMUL = [isContractableFMUL,
14343 isReassociable](SDValue N) {
14344 return isContractableFMUL(N) && isReassociable(N.getNode());
14347 auto isFusedOp = [&](SDValue N) {
14348 unsigned Opcode = N.getOpcode();
14349 return Opcode == ISD::FMA || Opcode == ISD::FMAD;
14352 // More folding opportunities when target permits.
14353 if (Aggressive && isReassociable(N)) {
14354 bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
14355 // fold (fsub (fma x, y, (fmul u, v)), z)
14356 // -> (fma x, y (fma u, v, (fneg z)))
14357 if (CanFuse && isFusedOp(N0) &&
14358 isContractableAndReassociableFMUL(N0.getOperand(2)) &&
14359 N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
14360 return DAG.getNode(PreferredFusedOpcode, SL, VT, N0.getOperand(0),
14362 DAG.getNode(PreferredFusedOpcode, SL, VT,
14363 N0.getOperand(2).getOperand(0),
14364 N0.getOperand(2).getOperand(1),
14365 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14368 // fold (fsub x, (fma y, z, (fmul u, v)))
14369 // -> (fma (fneg y), z, (fma (fneg u), v, x))
14370 if (CanFuse && isFusedOp(N1) &&
14371 isContractableAndReassociableFMUL(N1.getOperand(2)) &&
14372 N1->hasOneUse() && NoSignedZero) {
14373 SDValue N20 = N1.getOperand(2).getOperand(0);
14374 SDValue N21 = N1.getOperand(2).getOperand(1);
14375 return DAG.getNode(
14376 PreferredFusedOpcode, SL, VT,
14377 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1),
14378 DAG.getNode(PreferredFusedOpcode, SL, VT,
14379 DAG.getNode(ISD::FNEG, SL, VT, N20), N21, N0));
14382 // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
14383 // -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
14384 if (isFusedOp(N0) && N0->hasOneUse()) {
14385 SDValue N02 = N0.getOperand(2);
14386 if (N02.getOpcode() == ISD::FP_EXTEND) {
14387 SDValue N020 = N02.getOperand(0);
14388 if (isContractableAndReassociableFMUL(N020) &&
14389 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14390 N020.getValueType())) {
14391 return DAG.getNode(
14392 PreferredFusedOpcode, SL, VT, N0.getOperand(0), N0.getOperand(1),
14394 PreferredFusedOpcode, SL, VT,
14395 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(0)),
14396 DAG.getNode(ISD::FP_EXTEND, SL, VT, N020.getOperand(1)),
14397 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14402 // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
14403 // -> (fma (fpext x), (fpext y),
14404 // (fma (fpext u), (fpext v), (fneg z)))
14405 // FIXME: This turns two single-precision and one double-precision
14406 // operation into two double-precision operations, which might not be
14407 // interesting for all targets, especially GPUs.
14408 if (N0.getOpcode() == ISD::FP_EXTEND) {
14409 SDValue N00 = N0.getOperand(0);
14410 if (isFusedOp(N00)) {
14411 SDValue N002 = N00.getOperand(2);
14412 if (isContractableAndReassociableFMUL(N002) &&
14413 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14414 N00.getValueType())) {
14415 return DAG.getNode(
14416 PreferredFusedOpcode, SL, VT,
14417 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(0)),
14418 DAG.getNode(ISD::FP_EXTEND, SL, VT, N00.getOperand(1)),
14420 PreferredFusedOpcode, SL, VT,
14421 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(0)),
14422 DAG.getNode(ISD::FP_EXTEND, SL, VT, N002.getOperand(1)),
14423 DAG.getNode(ISD::FNEG, SL, VT, N1)));
14428 // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
14429 // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
// (the rest of this condition, e.g. hasOneUse checks, is elided from this dump)
14430 if (isFusedOp(N1) && N1.getOperand(2).getOpcode() == ISD::FP_EXTEND &&
14432 SDValue N120 = N1.getOperand(2).getOperand(0);
14433 if (isContractableAndReassociableFMUL(N120) &&
14434 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14435 N120.getValueType())) {
14436 SDValue N1200 = N120.getOperand(0);
14437 SDValue N1201 = N120.getOperand(1);
14438 return DAG.getNode(
14439 PreferredFusedOpcode, SL, VT,
14440 DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)), N1.getOperand(1),
14441 DAG.getNode(PreferredFusedOpcode, SL, VT,
14442 DAG.getNode(ISD::FNEG, SL, VT,
14443 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1200)),
14444 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1201), N0));
14448 // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
14449 // -> (fma (fneg (fpext y)), (fpext z),
14450 // (fma (fneg (fpext u)), (fpext v), x))
14451 // FIXME: This turns two single-precision and one double-precision
14452 // operation into two double-precision operations, which might not be
14453 // interesting for all targets, especially GPUs.
14454 if (N1.getOpcode() == ISD::FP_EXTEND && isFusedOp(N1.getOperand(0))) {
14455 SDValue CvtSrc = N1.getOperand(0);
14456 SDValue N100 = CvtSrc.getOperand(0);
14457 SDValue N101 = CvtSrc.getOperand(1);
14458 SDValue N102 = CvtSrc.getOperand(2);
14459 if (isContractableAndReassociableFMUL(N102) &&
14460 TLI.isFPExtFoldable(DAG, PreferredFusedOpcode, VT,
14461 CvtSrc.getValueType())) {
14462 SDValue N1020 = N102.getOperand(0);
14463 SDValue N1021 = N102.getOperand(1);
14464 return DAG.getNode(
14465 PreferredFusedOpcode, SL, VT,
14466 DAG.getNode(ISD::FNEG, SL, VT,
14467 DAG.getNode(ISD::FP_EXTEND, SL, VT, N100)),
14468 DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
14469 DAG.getNode(PreferredFusedOpcode, SL, VT,
14470 DAG.getNode(ISD::FNEG, SL, VT,
14471 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1020)),
14472 DAG.getNode(ISD::FP_EXTEND, SL, VT, N1021), N0));
// NOTE(review): this dump elides source lines (embedded line-number gaps):
// the declarations of `SL` and `HasFMA`, several early-return/fallthrough
// lines, lambda closers, some getNode trailing operands, and the function
// tail are missing. Code kept verbatim — verify against upstream.
14480 /// Try to perform FMA combining on a given FMUL node based on the distributive
14481 /// law x * (y + 1) = x * y + x and variants thereof (commuted versions,
14482 /// subtraction instead of addition).
14483 SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
14484 SDValue N0 = N->getOperand(0);
14485 SDValue N1 = N->getOperand(1);
14486 EVT VT = N->getValueType(0);
14489 assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
14491 const TargetOptions &Options = DAG.getTarget().Options;
14493 // The transforms below are incorrect when x == 0 and y == inf, because the
14494 // intermediate multiplication produces a nan.
14495 SDValue FAdd = N0.getOpcode() == ISD::FADD ? N0 : N1;
14496 if (!hasNoInfs(Options, FAdd))
14499 // Floating-point multiply-add without intermediate rounding.
// (the `bool HasFMA =` declaration line is elided from this dump)
14501 isContractableFMUL(Options, SDValue(N, 0)) &&
14502 TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT) &&
14503 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
14505 // Floating-point multiply-add with intermediate rounding. This can result
14506 // in a less precise result due to the changed rounding order.
14507 bool HasFMAD = Options.UnsafeFPMath &&
14508 (LegalOperations && TLI.isFMADLegal(DAG, N));
14510 // No valid opcode, do not combine.
14511 if (!HasFMAD && !HasFMA)
14514 // Always prefer FMAD to FMA for precision.
14515 unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
14516 bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
14518 // fold (fmul (fadd x0, +1.0), y) -> (fma x0, y, y)
14519 // fold (fmul (fadd x0, -1.0), y) -> (fma x0, y, (fneg y))
14520 auto FuseFADD = [&](SDValue X, SDValue Y) {
14521 if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
// isConstOrConstSplatFP with AllowUndefs=true matches scalar or splat FP consts.
14522 if (auto *C = isConstOrConstSplatFP(X.getOperand(1), true)) {
14523 if (C->isExactlyValue(+1.0))
14524 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14526 if (C->isExactlyValue(-1.0))
14527 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14528 DAG.getNode(ISD::FNEG, SL, VT, Y));
// Try both operand orders, since FMUL is commutative.
14534 if (SDValue FMA = FuseFADD(N0, N1))
14536 if (SDValue FMA = FuseFADD(N1, N0))
14539 // fold (fmul (fsub +1.0, x1), y) -> (fma (fneg x1), y, y)
14540 // fold (fmul (fsub -1.0, x1), y) -> (fma (fneg x1), y, (fneg y))
14541 // fold (fmul (fsub x0, +1.0), y) -> (fma x0, y, (fneg y))
14542 // fold (fmul (fsub x0, -1.0), y) -> (fma x0, y, y)
14543 auto FuseFSUB = [&](SDValue X, SDValue Y) {
14544 if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
14545 if (auto *C0 = isConstOrConstSplatFP(X.getOperand(0), true)) {
14546 if (C0->isExactlyValue(+1.0))
14547 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14548 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
14550 if (C0->isExactlyValue(-1.0))
14551 return DAG.getNode(PreferredFusedOpcode, SL, VT,
14552 DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
14553 DAG.getNode(ISD::FNEG, SL, VT, Y));
14555 if (auto *C1 = isConstOrConstSplatFP(X.getOperand(1), true)) {
14556 if (C1->isExactlyValue(+1.0))
14557 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
14558 DAG.getNode(ISD::FNEG, SL, VT, Y));
14559 if (C1->isExactlyValue(-1.0))
14560 return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
// Try both operand orders, since FMUL is commutative.
14567 if (SDValue FMA = FuseFSUB(N0, N1))
14569 if (SDValue FMA = FuseFSUB(N1, N0))
// Combine/simplify an ISD::FADD node. Returns the replacement value on a
// successful fold, otherwise falls through to the FMA combines at the end.
14575 SDValue DAGCombiner::visitFADD(SDNode *N) {
14576 SDValue N0 = N->getOperand(0);
14577 SDValue N1 = N->getOperand(1);
14578 bool N0CFP = DAG.isConstantFPBuildVectorOrConstantFP(N0);
14579 bool N1CFP = DAG.isConstantFPBuildVectorOrConstantFP(N1);
14580 EVT VT = N->getValueType(0);
14582 const TargetOptions &Options = DAG.getTarget().Options;
14583 SDNodeFlags Flags = N->getFlags();
// Nodes created below inherit N's fast-math flags for the lifetime of this
// inserter.
14584 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
14586 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
14589 // fold (fadd c1, c2) -> c1 + c2
14590 if (SDValue C = DAG.FoldConstantArithmetic(ISD::FADD, DL, VT, {N0, N1}))
14593 // canonicalize constant to RHS
14594 if (N0CFP && !N1CFP)
14595 return DAG.getNode(ISD::FADD, DL, VT, N1, N0);
14599 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
14602 // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
14603 ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
14604 if (N1C && N1C->isZero())
14605 if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
14608 if (SDValue NewSel = foldBinOpIntoSelect(N))
14611 // fold (fadd A, (fneg B)) -> (fsub A, B)
// Only if the negation of N1 is strictly cheaper than N1 itself.
14612 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
14613 if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
14614 N1, DAG, LegalOperations, ForCodeSize))
14615 return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1);
14617 // fold (fadd (fneg A), B) -> (fsub B, A)
14618 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
14619 if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
14620 N0, DAG, LegalOperations, ForCodeSize))
14621 return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0);
// Matches a single-use (fmul B, -2.0); the -2.0 may be a splat constant.
14623 auto isFMulNegTwo = [](SDValue FMul) {
14624 if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
14626 auto *C = isConstOrConstSplatFP(FMul.getOperand(1), true);
14627 return C && C->isExactlyValue(-2.0);
14630 // fadd (fmul B, -2.0), A --> fsub A, (fadd B, B)
14631 if (isFMulNegTwo(N0)) {
14632 SDValue B = N0.getOperand(0);
14633 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
14634 return DAG.getNode(ISD::FSUB, DL, VT, N1, Add);
14636 // fadd A, (fmul B, -2.0) --> fsub A, (fadd B, B)
14637 if (isFMulNegTwo(N1)) {
14638 SDValue B = N1.getOperand(0);
14639 SDValue Add = DAG.getNode(ISD::FADD, DL, VT, B, B);
14640 return DAG.getNode(ISD::FSUB, DL, VT, N0, Add);
14643 // No FP constant should be created after legalization as Instruction
14644 // Selection pass has a hard time dealing with FP constants.
14645 bool AllowNewConst = (Level < AfterLegalizeDAG);
14647 // If nnan is enabled, fold lots of things.
14648 if ((Options.NoNaNsFPMath || Flags.hasNoNaNs()) && AllowNewConst) {
14649 // If allowed, fold (fadd (fneg x), x) -> 0.0
14650 if (N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
14651 return DAG.getConstantFP(0.0, DL, VT);
14653 // If allowed, fold (fadd x, (fneg x)) -> 0.0
14654 if (N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
14655 return DAG.getConstantFP(0.0, DL, VT);
14658 // If 'unsafe math' or reassoc and nsz, fold lots of things.
14659 // TODO: break out portions of the transformations below for which Unsafe is
14660 // considered and which do not require both nsz and reassoc
14661 if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
14662 (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
14664 // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
14665 if (N1CFP && N0.getOpcode() == ISD::FADD &&
14666 DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
14667 SDValue NewC = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1);
14668 return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0), NewC);
14671 // We can fold chains of FADD's of the same value into multiplications.
14672 // This transform is not safe in general because we are reducing the number
14673 // of rounding steps.
14674 if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
14675 if (N0.getOpcode() == ISD::FMUL) {
// CFP00/CFP01: whether the inner FMUL's operands are FP constants. The
// !CFP00 guards below avoid re-matching an already constant-folded form.
14676 bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
14677 bool CFP01 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
14679 // (fadd (fmul x, c), x) -> (fmul x, c+1)
14680 if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
14681 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
14682 DAG.getConstantFP(1.0, DL, VT));
14683 return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP);
14686 // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
14687 if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
14688 N1.getOperand(0) == N1.getOperand(1) &&
14689 N0.getOperand(0) == N1.getOperand(0)) {
14690 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
14691 DAG.getConstantFP(2.0, DL, VT));
14692 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP);
// Mirror of the N0 cases above with the FMUL on the right-hand side.
14696 if (N1.getOpcode() == ISD::FMUL) {
14697 bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
14698 bool CFP11 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
14700 // (fadd x, (fmul x, c)) -> (fmul x, c+1)
14701 if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
14702 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
14703 DAG.getConstantFP(1.0, DL, VT));
14704 return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP);
14707 // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
14708 if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
14709 N0.getOperand(0) == N0.getOperand(1) &&
14710 N1.getOperand(0) == N0.getOperand(0)) {
14711 SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
14712 DAG.getConstantFP(2.0, DL, VT));
14713 return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP);
14717 if (N0.getOpcode() == ISD::FADD) {
14718 bool CFP00 = DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
14719 // (fadd (fadd x, x), x) -> (fmul x, 3.0)
14720 if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
14721 (N0.getOperand(0) == N1)) {
14722 return DAG.getNode(ISD::FMUL, DL, VT, N1,
14723 DAG.getConstantFP(3.0, DL, VT));
14727 if (N1.getOpcode() == ISD::FADD) {
14728 bool CFP10 = DAG.isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
14729 // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
14730 if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
14731 N1.getOperand(0) == N0) {
14732 return DAG.getNode(ISD::FMUL, DL, VT, N0,
14733 DAG.getConstantFP(3.0, DL, VT));
14737 // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
14738 if (N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
14739 N0.getOperand(0) == N0.getOperand(1) &&
14740 N1.getOperand(0) == N1.getOperand(1) &&
14741 N0.getOperand(0) == N1.getOperand(0)) {
14742 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
14743 DAG.getConstantFP(4.0, DL, VT));
14746 } // enable-unsafe-fp-math
14748 // FADD -> FMA combines:
14749 if (SDValue Fused = visitFADDForFMACombine(N)) {
14750 AddToWorklist(Fused.getNode());
// Combine an ISD::STRICT_FADD node. Strict FP nodes carry a chain (operand 0,
// result 1), so the fadd/fneg -> fsub folds below must thread Chain through
// and produce a two-result (VT, ChainVT) node.
14756 SDValue DAGCombiner::visitSTRICT_FADD(SDNode *N) {
14757 SDValue Chain = N->getOperand(0);
14758 SDValue N0 = N->getOperand(1);
14759 SDValue N1 = N->getOperand(2);
14760 EVT VT = N->getValueType(0);
14761 EVT ChainVT = N->getValueType(1);
14763 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
14765 // fold (strict_fadd A, (fneg B)) -> (strict_fsub A, B)
// Only when negating N1 is strictly cheaper than keeping it.
14766 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
14767 if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
14768 N1, DAG, LegalOperations, ForCodeSize)) {
14769 return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
14770 {Chain, N0, NegN1});
14773 // fold (strict_fadd (fneg A), B) -> (strict_fsub B, A)
14774 if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::STRICT_FSUB, VT))
14775 if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
14776 N0, DAG, LegalOperations, ForCodeSize)) {
14777 return DAG.getNode(ISD::STRICT_FSUB, DL, DAG.getVTList(VT, ChainVT),
14778 {Chain, N1, NegN0});
// Combine/simplify an ISD::FSUB node; falls through to the FSUB -> FMA
// combines at the end if no simpler fold applies.
14783 SDValue DAGCombiner::visitFSUB(SDNode *N) {
14784 SDValue N0 = N->getOperand(0);
14785 SDValue N1 = N->getOperand(1);
14786 ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, true);
14787 ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
14788 EVT VT = N->getValueType(0);
14790 const TargetOptions &Options = DAG.getTarget().Options;
14791 const SDNodeFlags Flags = N->getFlags();
14792 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
14794 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
14797 // fold (fsub c1, c2) -> c1-c2
14798 if (SDValue C = DAG.FoldConstantArithmetic(ISD::FSUB, DL, VT, {N0, N1}))
14803 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
14806 if (SDValue NewSel = foldBinOpIntoSelect(N))
14809 // (fsub A, 0) -> A
// Subtracting -0.0 is always exact; subtracting +0.0 additionally needs nsz
// because (-0.0) - (+0.0) is -0.0, not the original -0.0's identity.
14810 if (N1CFP && N1CFP->isZero()) {
14811 if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
14812 Flags.hasNoSignedZeros()) {
14818 // (fsub x, x) -> 0.0
// Requires nnan: Inf - Inf and NaN - NaN are NaN, not 0.0.
14819 if (Options.NoNaNsFPMath || Flags.hasNoNaNs())
14820 return DAG.getConstantFP(0.0f, DL, VT);
14823 // (fsub -0.0, N1) -> -N1
14824 if (N0CFP && N0CFP->isZero()) {
14825 if (N0CFP->isNegative() ||
14826 (Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
14827 // We cannot replace an FSUB(+-0.0,X) with FNEG(X) when denormals are
14828 // flushed to zero, unless all users treat denorms as zero (DAZ).
14829 // FIXME: This transform will change the sign of a NaN and the behavior
14830 // of a signaling NaN. It is only valid when a NoNaN flag is present.
14831 DenormalMode DenormMode = DAG.getDenormalMode(VT);
14832 if (DenormMode == DenormalMode::getIEEE()) {
14833 if (SDValue NegN1 =
14834 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
14836 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
14837 return DAG.getNode(ISD::FNEG, DL, VT, N1);
// Reassociation folds: need (unsafe && nsz options) or (reassoc && nsz flags).
14842 if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
14843 (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
14844 N1.getOpcode() == ISD::FADD) {
14845 // X - (X + Y) -> -Y
14846 if (N0 == N1->getOperand(0))
14847 return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1));
14848 // X - (Y + X) -> -Y
14849 if (N0 == N1->getOperand(1))
14850 return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(0));
14853 // fold (fsub A, (fneg B)) -> (fadd A, B)
14854 if (SDValue NegN1 =
14855 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
14856 return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1);
14858 // FSUB -> FMA combines:
14859 if (SDValue Fused = visitFSUBForFMACombine(N)) {
14860 AddToWorklist(Fused.getNode());
// Combine/simplify an ISD::FMUL node; falls through to the distributive
// FMUL -> FMA combine at the end.
14867 SDValue DAGCombiner::visitFMUL(SDNode *N) {
14868 SDValue N0 = N->getOperand(0);
14869 SDValue N1 = N->getOperand(1);
14870 ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1, true);
14871 EVT VT = N->getValueType(0);
14873 const TargetOptions &Options = DAG.getTarget().Options;
14874 const SDNodeFlags Flags = N->getFlags();
14875 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
14877 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
14880 // fold (fmul c1, c2) -> c1*c2
14881 if (SDValue C = DAG.FoldConstantArithmetic(ISD::FMUL, DL, VT, {N0, N1}))
14884 // canonicalize constant to RHS
14885 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
14886 !DAG.isConstantFPBuildVectorOrConstantFP(N1))
14887 return DAG.getNode(ISD::FMUL, DL, VT, N1, N0);
14891 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
14894 if (SDValue NewSel = foldBinOpIntoSelect(N))
// Reassociation-dependent folds (change rounding behavior).
14897 if (Options.UnsafeFPMath || Flags.hasAllowReassociation()) {
14898 // fmul (fmul X, C1), C2 -> fmul X, C1 * C2
14899 if (DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
14900 N0.getOpcode() == ISD::FMUL) {
14901 SDValue N00 = N0.getOperand(0);
14902 SDValue N01 = N0.getOperand(1);
14903 // Avoid an infinite loop by making sure that N00 is not a constant
14904 // (the inner multiply has not been constant folded yet).
14905 if (DAG.isConstantFPBuildVectorOrConstantFP(N01) &&
14906 !DAG.isConstantFPBuildVectorOrConstantFP(N00)) {
14907 SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1);
14908 return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts);
14912 // Match a special-case: we convert X * 2.0 into fadd.
14913 // fmul (fadd X, X), C -> fmul X, 2.0 * C
14914 if (N0.getOpcode() == ISD::FADD && N0.hasOneUse() &&
14915 N0.getOperand(0) == N0.getOperand(1)) {
14916 const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
14917 SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1);
14918 return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts);
14922 // fold (fmul X, 2.0) -> (fadd X, X)
14923 if (N1CFP && N1CFP->isExactlyValue(+2.0))
14924 return DAG.getNode(ISD::FADD, DL, VT, N0, N0);
14926 // fold (fmul X, -1.0) -> (fsub -0.0, X)
14927 if (N1CFP && N1CFP->isExactlyValue(-1.0)) {
14928 if (!LegalOperations || TLI.isOperationLegal(ISD::FSUB, VT)) {
14929 return DAG.getNode(ISD::FSUB, DL, VT,
14930 DAG.getConstantFP(-0.0, DL, VT), N0, Flags);
14934 // -N0 * -N1 --> N0 * N1
// Only profitable when at least one negation is strictly cheaper; otherwise
// we would just be shuffling fnegs around.
14935 TargetLowering::NegatibleCost CostN0 =
14936 TargetLowering::NegatibleCost::Expensive;
14937 TargetLowering::NegatibleCost CostN1 =
14938 TargetLowering::NegatibleCost::Expensive;
14940 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
14942 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
14943 if (NegN0 && NegN1 &&
14944 (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
14945 CostN1 == TargetLowering::NegatibleCost::Cheaper))
14946 return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1);
14948 // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
14949 // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
14950 if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
14951 (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
14952 TLI.isOperationLegal(ISD::FABS, VT)) {
// Normalize so Select holds the SELECT and X the other multiplicand.
14953 SDValue Select = N0, X = N1;
14954 if (Select.getOpcode() != ISD::SELECT)
14955 std::swap(Select, X);
14957 SDValue Cond = Select.getOperand(0);
14958 auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
14959 auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
// Require a (setcc X, 0.0) condition comparing X against zero.
14961 if (TrueOpnd && FalseOpnd &&
14962 Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
14963 isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
14964 cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
14965 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
// NOTE(review): the condition-code canonicalization cases are elided in
// this view; this swap flips true/false for the inverted comparisons.
14974 std::swap(TrueOpnd, FalseOpnd);
14982 if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
14983 TLI.isOperationLegal(ISD::FNEG, VT))
14984 return DAG.getNode(ISD::FNEG, DL, VT,
14985 DAG.getNode(ISD::FABS, DL, VT, X));
14986 if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
14987 return DAG.getNode(ISD::FABS, DL, VT, X);
14994 // FMUL -> FMA combines:
14995 if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
14996 AddToWorklist(Fused.getNode());
// Combine/simplify an ISD::FMA node (N0 * N1 + N2).
15003 SDValue DAGCombiner::visitFMA(SDNode *N) {
15004 SDValue N0 = N->getOperand(0);
15005 SDValue N1 = N->getOperand(1);
15006 SDValue N2 = N->getOperand(2);
// Scalar-only constant views (dyn_cast, not splat-aware) used by the folds
// below; the all-constant fold uses isa<> directly instead.
15007 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
15008 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
15009 EVT VT = N->getValueType(0);
15011 const TargetOptions &Options = DAG.getTarget().Options;
15012 // FMA nodes have flags that propagate to the created nodes.
15013 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15015 bool CanReassociate =
15016 Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
15018 // Constant fold FMA.
// Re-emitting the node with all-constant operands lets getNode fold it.
15019 if (isa<ConstantFPSDNode>(N0) &&
15020 isa<ConstantFPSDNode>(N1) &&
15021 isa<ConstantFPSDNode>(N2)) {
15022 return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
15025 // (-N0 * -N1) + N2 --> (N0 * N1) + N2
15026 TargetLowering::NegatibleCost CostN0 =
15027 TargetLowering::NegatibleCost::Expensive;
15028 TargetLowering::NegatibleCost CostN1 =
15029 TargetLowering::NegatibleCost::Expensive;
15031 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
15033 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
15034 if (NegN0 && NegN1 &&
15035 (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
15036 CostN1 == TargetLowering::NegatibleCost::Cheaper))
15037 return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2);
15039 // FIXME: use fast math flags instead of Options.UnsafeFPMath
15040 if (Options.UnsafeFPMath) {
// Zero multiplicand folds (the elided returns presumably yield N2).
15041 if (N0CFP && N0CFP->isZero())
15043 if (N1CFP && N1CFP->isZero())
// fma 1.0, X, Y / fma X, 1.0, Y -> fadd.
15047 if (N0CFP && N0CFP->isExactlyValue(1.0))
15048 return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
15049 if (N1CFP && N1CFP->isExactlyValue(1.0))
15050 return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
15052 // Canonicalize (fma c, x, y) -> (fma x, c, y)
15053 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
15054 !DAG.isConstantFPBuildVectorOrConstantFP(N1))
15055 return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
15057 if (CanReassociate) {
15058 // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
15059 if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
15060 DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
15061 DAG.isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
15062 return DAG.getNode(ISD::FMUL, DL, VT, N0,
15063 DAG.getNode(ISD::FADD, DL, VT, N1, N2.getOperand(1)));
15066 // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
15067 if (N0.getOpcode() == ISD::FMUL &&
15068 DAG.isConstantFPBuildVectorOrConstantFP(N1) &&
15069 DAG.isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
15070 return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
15071 DAG.getNode(ISD::FMUL, DL, VT, N1, N0.getOperand(1)),
15076 // (fma x, -1, y) -> (fadd (fneg x), y)
// NOTE(review): the guard establishing N1CFP != nullptr for this section is
// elided in this view.
15078 if (N1CFP->isExactlyValue(1.0))
15079 return DAG.getNode(ISD::FADD, DL, VT, N0, N2);
15081 if (N1CFP->isExactlyValue(-1.0) &&
15082 (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
15083 SDValue RHSNeg = DAG.getNode(ISD::FNEG, DL, VT, N0);
15084 AddToWorklist(RHSNeg.getNode());
15085 return DAG.getNode(ISD::FADD, DL, VT, N2, RHSNeg);
15088 // fma (fneg x), K, y -> fma x -K, y
// Only if -K is materializable: ConstantFP is legal, or K's node has one use
// and K is not a legal FP immediate anyway.
15089 if (N0.getOpcode() == ISD::FNEG &&
15090 (TLI.isOperationLegal(ISD::ConstantFP, VT) ||
15091 (N1.hasOneUse() && !TLI.isFPImmLegal(N1CFP->getValueAPF(), VT,
15093 return DAG.getNode(ISD::FMA, DL, VT, N0.getOperand(0),
15094 DAG.getNode(ISD::FNEG, DL, VT, N1), N2);
15098 if (CanReassociate) {
15099 // (fma x, c, x) -> (fmul x, (c+1))
15100 if (N1CFP && N0 == N2) {
15101 return DAG.getNode(
15102 ISD::FMUL, DL, VT, N0,
15103 DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(1.0, DL, VT)));
15106 // (fma x, c, (fneg x)) -> (fmul x, (c-1))
15107 if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
15108 return DAG.getNode(
15109 ISD::FMUL, DL, VT, N0,
15110 DAG.getNode(ISD::FADD, DL, VT, N1, DAG.getConstantFP(-1.0, DL, VT)));
15114 // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
15115 // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
// Skip when FNEG is free: the negated form would not be cheaper.
15116 if (!TLI.isFNegFree(VT))
15117 if (SDValue Neg = TLI.getCheaperNegatedExpression(
15118 SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
15119 return DAG.getNode(ISD::FNEG, DL, VT, Neg);
15123 // Combine multiple FDIVs with the same divisor into multiple FMULs by the
15125 // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
15126 // Notice that this is not always beneficial. One reason is different targets
15127 // may have different costs for FDIV and FMUL, so sometimes the cost of two
15128 // FDIVs may be lower than the cost of one FDIV and two FMULs. Another reason
15129 // is the critical path is increased from "one FDIV" to "one FDIV + one FMUL".
15130 SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
15131 // TODO: Limit this transform based on optsize/minsize - it always creates at
15132 // least 1 extra instruction. But the perf win may be substantial enough
15133 // that only minsize should restrict this.
15134 bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
15135 const SDNodeFlags Flags = N->getFlags();
// Requires global unsafe math or the arcp (allow-reciprocal) flag; never
// after full DAG legalization.
15136 if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
15139 // Skip if current node is a reciprocal/fneg-reciprocal.
15140 SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
15141 ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0, /* AllowUndefs */ true);
15142 if (N0CFP && (N0CFP->isExactlyValue(1.0) || N0CFP->isExactlyValue(-1.0)))
15145 // Exit early if the target does not want this transform or if there can't
15146 // possibly be enough uses of the divisor to make the transform worthwhile.
15147 unsigned MinUses = TLI.combineRepeatedFPDivisors();
15149 // For splat vectors, scale the number of uses by the splat factor. If we can
15150 // convert the division into a scalar op, that will likely be much faster.
15151 unsigned NumElts = 1;
15152 EVT VT = N->getValueType(0);
15153 if (VT.isVector() && DAG.isSplatValue(N1))
15154 NumElts = VT.getVectorMinNumElements();
15156 if (!MinUses || (N1->use_size() * NumElts) < MinUses)
15159 // Find all FDIV users of the same divisor.
15160 // Use a set because duplicates may be present in the user list.
15161 SetVector<SDNode *> Users;
15162 for (auto *U : N1->uses()) {
15163 if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
15164 // Skip X/sqrt(X) that has not been simplified to sqrt(X) yet.
15165 if (U->getOperand(1).getOpcode() == ISD::FSQRT &&
15166 U->getOperand(0) == U->getOperand(1).getOperand(0) &&
15167 U->getFlags().hasAllowReassociation() &&
15168 U->getFlags().hasNoSignedZeros())
15171 // This division is eligible for optimization only if global unsafe math
15172 // is enabled or if this division allows reciprocal formation.
15173 if (UnsafeMath || U->getFlags().hasAllowReciprocal())
15178 // Now that we have the actual number of divisor uses, make sure it meets
15179 // the minimum threshold specified by the target.
15180 if ((Users.size() * NumElts) < MinUses)
// Build the shared reciprocal once, then rewrite every eligible user.
15184 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
15185 SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
15187 // Dividend / Divisor -> Dividend * Reciprocal
15188 for (auto *U : Users) {
15189 SDValue Dividend = U->getOperand(0);
15190 if (Dividend != FPOne) {
15191 SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
15192 Reciprocal, Flags);
15193 CombineTo(U, NewNode);
15194 } else if (U != Reciprocal.getNode()) {
15195 // In the absence of fast-math-flags, this user node is always the
15196 // same node as Reciprocal, but with FMF they may be different nodes.
15197 CombineTo(U, Reciprocal);
15200 return SDValue(N, 0); // N was replaced.
// Combine/simplify an ISD::FDIV node, including reciprocal and
// reciprocal-square-root estimate formation.
15203 SDValue DAGCombiner::visitFDIV(SDNode *N) {
15204 SDValue N0 = N->getOperand(0);
15205 SDValue N1 = N->getOperand(1);
15206 EVT VT = N->getValueType(0);
15208 const TargetOptions &Options = DAG.getTarget().Options;
15209 SDNodeFlags Flags = N->getFlags();
15210 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15212 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
15215 // fold (fdiv c1, c2) -> c1/c2
15216 if (SDValue C = DAG.FoldConstantArithmetic(ISD::FDIV, DL, VT, {N0, N1}))
15221 if (SDValue FoldedVOp = SimplifyVBinOp(N, DL))
15224 if (SDValue NewSel = foldBinOpIntoSelect(N))
// Share one reciprocal among all FDIV users of the same divisor.
15227 if (SDValue V = combineRepeatedFPDivisors(N))
15230 if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
15231 // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
15232 if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
15233 // Compute the reciprocal 1.0 / c2.
15234 const APFloat &N1APF = N1CFP->getValueAPF();
15235 APFloat Recip(N1APF.getSemantics(), 1); // 1.0
15236 APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
15237 // Only do the transform if the reciprocal is a legal fp immediate that
15238 // isn't too nasty (eg NaN, denormal, ...).
15239 if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
15240 (!LegalOperations ||
15241 // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
15242 // backend)... we should handle this gracefully after Legalize.
15243 // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
15244 TLI.isOperationLegal(ISD::ConstantFP, VT) ||
15245 TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
15246 return DAG.getNode(ISD::FMUL, DL, VT, N0,
15247 DAG.getConstantFP(Recip, DL, VT));
15250 // If this FDIV is part of a reciprocal square root, it may be folded
15251 // into a target-specific square root estimate instruction.
15252 if (N1.getOpcode() == ISD::FSQRT) {
15253 if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
15254 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
// Also look through an fp_extend/fp_round around the sqrt, re-applying the
// cast to the estimate.
15255 } else if (N1.getOpcode() == ISD::FP_EXTEND &&
15256 N1.getOperand(0).getOpcode() == ISD::FSQRT) {
15258 buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
15259 RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
15260 AddToWorklist(RV.getNode());
15261 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
15263 } else if (N1.getOpcode() == ISD::FP_ROUND &&
15264 N1.getOperand(0).getOpcode() == ISD::FSQRT) {
15266 buildRsqrtEstimate(N1.getOperand(0).getOperand(0), Flags)) {
15267 RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
15268 AddToWorklist(RV.getNode());
15269 return DAG.getNode(ISD::FMUL, DL, VT, N0, RV);
15271 } else if (N1.getOpcode() == ISD::FMUL) {
15272 // Look through an FMUL. Even though this won't remove the FDIV directly,
15273 // it's still worthwhile to get rid of the FSQRT if possible.
// Find the FSQRT on either side of the FMUL; Y is the other factor.
15275 if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
15276 Sqrt = N1.getOperand(0);
15277 Y = N1.getOperand(1);
15278 } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
15279 Sqrt = N1.getOperand(1);
15280 Y = N1.getOperand(0);
15282 if (Sqrt.getNode()) {
15283 // If the other multiply operand is known positive, pull it into the
15284 // sqrt. That will eliminate the division if we convert to an estimate.
15285 if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
15286 N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse()) {
// A is known non-negative: either Y = fabs(A), or Y equals the sqrt
// operand itself (A = sqrt-arg, and A*A >= 0).
15288 if (Y.getOpcode() == ISD::FABS && Y.hasOneUse())
15289 A = Y.getOperand(0);
15290 else if (Y == Sqrt.getOperand(0))
15293 // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
15294 // X / (A * sqrt(A)) --> X / sqrt(A*A*A) --> X * rsqrt(A*A*A)
15295 SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, A, A);
15297 DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0));
15298 if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
15299 return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt);
15301 // Estimate creation failed. Clean up speculatively created nodes.
15302 recursivelyDeleteUnusedNodes(AAZ.getNode());
15306 // We found a FSQRT, so try to make this fold:
15307 // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
15308 if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
15309 SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y);
15310 AddToWorklist(Div.getNode());
15311 return DAG.getNode(ISD::FMUL, DL, VT, N0, Div);
15316 // Fold into a reciprocal estimate and multiply instead of a real divide.
15317 if (Options.NoInfsFPMath || Flags.hasNoInfs())
15318 if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
15322 // Fold X/Sqrt(X) -> Sqrt(X)
15323 if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
15324 (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
15325 if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
15328 // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
15329 TargetLowering::NegatibleCost CostN0 =
15330 TargetLowering::NegatibleCost::Expensive;
15331 TargetLowering::NegatibleCost CostN1 =
15332 TargetLowering::NegatibleCost::Expensive;
15334 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
15336 TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
15337 if (NegN0 && NegN1 &&
15338 (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
15339 CostN1 == TargetLowering::NegatibleCost::Cheaper))
15340 return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1);
// Combine/simplify an ISD::FREM node: only generic binop simplification,
// constant folding, and the select fold apply here.
15345 SDValue DAGCombiner::visitFREM(SDNode *N) {
15346 SDValue N0 = N->getOperand(0);
15347 SDValue N1 = N->getOperand(1);
15348 EVT VT = N->getValueType(0);
15349 SDNodeFlags Flags = N->getFlags();
15350 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15352 if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
15355 // fold (frem c1, c2) -> fmod(c1,c2)
15356 if (SDValue C = DAG.FoldConstantArithmetic(ISD::FREM, SDLoc(N), VT, {N0, N1}))
15359 if (SDValue NewSel = foldBinOpIntoSelect(N))
// Replace an ISD::FSQRT with a target square-root estimate when the
// approximate-funcs and no-infs fast-math conditions allow it.
15365 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
15366 SDNodeFlags Flags = N->getFlags();
15367 const TargetOptions &Options = DAG.getTarget().Options;
15369 // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
15370 // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
15371 if (!Flags.hasApproximateFuncs() ||
15372 (!Options.NoInfsFPMath && !Flags.hasNoInfs()))
// Don't bother estimating if the target computes sqrt cheaply anyway.
15375 SDValue N0 = N->getOperand(0);
15376 if (TLI.isFsqrtCheap(N0, DAG))
15379 // FSQRT nodes have flags that propagate to the created nodes.
15380 // TODO: If this is N0/sqrt(N0), and we reach this node before trying to
15381 // transform the fdiv, we may produce a sub-optimal estimate sequence
15382 // because the reciprocal calculation may not have to filter out a
15384 return buildSqrtEstimate(N0, Flags);
15387 /// copysign(x, fp_extend(y)) -> copysign(x, y)
15388 /// copysign(x, fp_round(y)) -> copysign(x, y)
// Returns true when an FP cast feeding a FCOPYSIGN sign operand can be
// looked through (only the sign bit of operand 1 matters).
15389 static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
15390 SDValue N1 = N->getOperand(1);
15391 if ((N1.getOpcode() == ISD::FP_EXTEND ||
15392 N1.getOpcode() == ISD::FP_ROUND)) {
15393 EVT N1VT = N1->getValueType(0);
15394 EVT N1Op0VT = N1->getOperand(0).getValueType();
15396 // Always fold no-op FP casts.
15397 if (N1VT == N1Op0VT)
15400 // Do not optimize out type conversion of f128 type yet.
15401 // For some targets like x86_64, configuration is changed to keep one f128
15402 // value in one SSE register, but instruction selection cannot handle
15403 // FCOPYSIGN on SSE registers yet.
15404 if (N1Op0VT == MVT::f128)
15407 // Avoid mismatched vector operand types, for better instruction selection.
15408 if (N1Op0VT.isVector())
// Combine/simplify an ISD::FCOPYSIGN node. Operand 0 supplies the magnitude,
// operand 1 only its sign bit; that asymmetry drives every fold below.
15416 SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
15417 SDValue N0 = N->getOperand(0);
15418 SDValue N1 = N->getOperand(1);
15419 EVT VT = N->getValueType(0);
15421 // fold (fcopysign c1, c2) -> fcopysign(c1,c2)
15423 DAG.FoldConstantArithmetic(ISD::FCOPYSIGN, SDLoc(N), VT, {N0, N1}))
15426 if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N->getOperand(1))) {
15427 const APFloat &V = N1C->getValueAPF();
15428 // copysign(x, c1) -> fabs(x) iff ispos(c1)
15429 // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
15430 if (!V.isNegative()) {
15431 if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
15432 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
15434 if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
15435 return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
15436 DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
15440 // copysign(fabs(x), y) -> copysign(x, y)
15441 // copysign(fneg(x), y) -> copysign(x, y)
15442 // copysign(copysign(x,z), y) -> copysign(x, y)
// The sign of the magnitude operand is overwritten anyway, so sign-only
// wrappers on N0 can be stripped.
15443 if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
15444 N0.getOpcode() == ISD::FCOPYSIGN)
15445 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0.getOperand(0), N1);
15447 // copysign(x, abs(y)) -> abs(x)
15448 if (N1.getOpcode() == ISD::FABS)
15449 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
15451 // copysign(x, copysign(y,z)) -> copysign(x, z)
15452 if (N1.getOpcode() == ISD::FCOPYSIGN)
15453 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(1));
15455 // copysign(x, fp_extend(y)) -> copysign(x, y)
15456 // copysign(x, fp_round(y)) -> copysign(x, y)
15457 if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
15458 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1.getOperand(0));
// Combine FPOW with a constant exponent: x**(1/3) -> FCBRT, and
// x**0.25 / x**0.75 -> sqrt-based expansions, under the fast-math flags
// documented at each transform.
15463 SDValue DAGCombiner::visitFPOW(SDNode *N) {
15464 ConstantFPSDNode *ExponentC = isConstOrConstSplatFP(N->getOperand(1));
15467 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15469 // Try to convert x ** (1/3) into cube root.
15470 // TODO: Handle the various flavors of long double.
15471 // TODO: Since we're approximating, we don't need an exact 1/3 exponent.
15472 // Some range near 1/3 should be fine.
15473 EVT VT = N->getValueType(0);
15474 if ((VT == MVT::f32 && ExponentC->getValueAPF().isExactlyValue(1.0f/3.0f)) ||
15475 (VT == MVT::f64 && ExponentC->getValueAPF().isExactlyValue(1.0/3.0))) {
15476 // pow(-0.0, 1/3) = +0.0; cbrt(-0.0) = -0.0.
15477 // pow(-inf, 1/3) = +inf; cbrt(-inf) = -inf.
15478 // pow(-val, 1/3) = nan; cbrt(-val) = -num.
15479 // For regular numbers, rounding may cause the results to differ.
15480 // Therefore, we require { nsz ninf nnan afn } for this transform.
15481 // TODO: We could select out the special cases if we don't have nsz/ninf.
15482 SDNodeFlags Flags = N->getFlags();
15483 if (!Flags.hasNoSignedZeros() || !Flags.hasNoInfs() || !Flags.hasNoNaNs() ||
15484 !Flags.hasApproximateFuncs())
15487 // Do not create a cbrt() libcall if the target does not have it, and do not
15488 // turn a pow that has lowering support into a cbrt() libcall.
15489 if (!DAG.getLibInfo().has(LibFunc_cbrt) ||
15490 (!DAG.getTargetLoweringInfo().isOperationExpand(ISD::FPOW, VT) &&
15491 DAG.getTargetLoweringInfo().isOperationExpand(ISD::FCBRT, VT)))
15494 return DAG.getNode(ISD::FCBRT, SDLoc(N), VT, N->getOperand(0));
15497 // Try to convert x ** (1/4) and x ** (3/4) into square roots.
15498 // x ** (1/2) is canonicalized to sqrt, so we do not bother with that case.
15499 // TODO: This could be extended (using a target hook) to handle smaller
15500 // power-of-2 fractional exponents.
15501 bool ExponentIs025 = ExponentC->getValueAPF().isExactlyValue(0.25);
15502 bool ExponentIs075 = ExponentC->getValueAPF().isExactlyValue(0.75);
15503 if (ExponentIs025 || ExponentIs075) {
15504 // pow(-0.0, 0.25) = +0.0; sqrt(sqrt(-0.0)) = -0.0.
15505 // pow(-inf, 0.25) = +inf; sqrt(sqrt(-inf)) = NaN.
15506 // pow(-0.0, 0.75) = +0.0; sqrt(-0.0) * sqrt(sqrt(-0.0)) = +0.0.
15507 // pow(-inf, 0.75) = +inf; sqrt(-inf) * sqrt(sqrt(-inf)) = NaN.
15508 // For regular numbers, rounding may cause the results to differ.
15509 // Therefore, we require { nsz ninf afn } for this transform.
15510 // TODO: We could select out the special cases if we don't have nsz/ninf.
15511 SDNodeFlags Flags = N->getFlags();
15513 // We only need no signed zeros for the 0.25 case.
15514 if ((!Flags.hasNoSignedZeros() && ExponentIs025) || !Flags.hasNoInfs() ||
15515 !Flags.hasApproximateFuncs())
15518 // Don't double the number of libcalls. We are trying to inline fast code.
15519 if (!DAG.getTargetLoweringInfo().isOperationLegalOrCustom(ISD::FSQRT, VT))
15522 // Assume that libcalls are the smallest code.
15523 // TODO: This restriction should probably be lifted for vectors.
15527 // pow(X, 0.25) --> sqrt(sqrt(X))
15529 SDValue Sqrt = DAG.getNode(ISD::FSQRT, DL, VT, N->getOperand(0));
15530 SDValue SqrtSqrt = DAG.getNode(ISD::FSQRT, DL, VT, Sqrt);
15533 // pow(X, 0.75) --> sqrt(X) * sqrt(sqrt(X))
15534 return DAG.getNode(ISD::FMUL, DL, VT, Sqrt, SqrtSqrt);
// Fold [us]itofp(fpto[us]i X) --> ftrunc X when the round-trip through the
// integer type is equivalent to truncation toward zero.
15540 static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG,
15541 const TargetLowering &TLI) {
15542 // We only do this if the target has legal ftrunc. Otherwise, we'd likely be
15543 // replacing casts with a libcall. We also must be allowed to ignore -0.0
15544 // because FTRUNC will return -0.0 for (-1.0, -0.0), but using integer
15545 // conversions would return +0.0.
15546 // FIXME: We should be able to use node-level FMF here.
15547 // TODO: If strict math, should we use FABS (+ range check for signed cast)?
15548 EVT VT = N->getValueType(0);
15549 if (!TLI.isOperationLegal(ISD::FTRUNC, VT) ||
15550 !DAG.getTarget().Options.NoSignedZerosFPMath)
15553 // fptosi/fptoui round towards zero, so converting from FP to integer and
15554 // back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
// Signed and unsigned variants are matched separately; both require that
// the inner conversion started from the same FP type we are producing.
15555 SDValue N0 = N->getOperand(0);
15556 if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
15557 N0.getOperand(0).getValueType() == VT)
15558 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
15560 if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
15561 N0.getOperand(0).getValueType() == VT)
15562 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0));
// Combine SINT_TO_FP: constant-fold, convert to UINT_TO_FP when the sign bit
// is known zero, turn setcc-based inputs into FP selects, and fold the
// int<->FP round trip via foldFPToIntToFP.
15567 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
15568 SDValue N0 = N->getOperand(0);
15569 EVT VT = N->getValueType(0);
15570 EVT OpVT = N0.getValueType();
15572 // [us]itofp(undef) = 0, because the result value is bounded.
15574 return DAG.getConstantFP(0.0, SDLoc(N), VT);
15576 // fold (sint_to_fp c1) -> c1fp
15577 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
15578 // ...but only if the target supports immediate floating-point values
15579 (!LegalOperations ||
15580 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
15581 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
15583 // If the input is a legal type, and SINT_TO_FP is not legal on this target,
15584 // but UINT_TO_FP is legal on this target, try to convert.
15585 if (!hasOperation(ISD::SINT_TO_FP, OpVT) &&
15586 hasOperation(ISD::UINT_TO_FP, OpVT)) {
15587 // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
15588 if (DAG.SignBitIsZero(N0))
15589 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
15592 // The next optimizations are desirable only if SELECT_CC can be lowered.
// sint_to_fp of an i1 setcc yields -1.0 (true, i.e. sign-extended 1) or 0.0.
15593 // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
15594 if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
15596 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
15598 return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
15599 DAG.getConstantFP(0.0, DL, VT));
// A zero-extended setcc yields 0 or 1, hence 1.0/0.0 after the conversion.
15602 // fold (sint_to_fp (zext (setcc x, y, cc))) ->
15603 // (select (setcc x, y, cc), 1.0, 0.0)
15604 if (N0.getOpcode() == ISD::ZERO_EXTEND &&
15605 N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
15606 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
15608 return DAG.getSelect(DL, VT, N0.getOperand(0),
15609 DAG.getConstantFP(1.0, DL, VT),
15610 DAG.getConstantFP(0.0, DL, VT));
15613 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
// Combine UINT_TO_FP: mirror of visitSINT_TO_FP — constant-fold, convert to
// SINT_TO_FP when the sign bit is known zero, fold setcc inputs to FP
// selects, and fold the int<->FP round trip.
15619 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
15620 SDValue N0 = N->getOperand(0);
15621 EVT VT = N->getValueType(0);
15622 EVT OpVT = N0.getValueType();
15624 // [us]itofp(undef) = 0, because the result value is bounded.
15626 return DAG.getConstantFP(0.0, SDLoc(N), VT);
15628 // fold (uint_to_fp c1) -> c1fp
15629 if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
15630 // ...but only if the target supports immediate floating-point values
15631 (!LegalOperations ||
15632 TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT)))
15633 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
15635 // If the input is a legal type, and UINT_TO_FP is not legal on this target,
15636 // but SINT_TO_FP is legal on this target, try to convert.
15637 if (!hasOperation(ISD::UINT_TO_FP, OpVT) &&
15638 hasOperation(ISD::SINT_TO_FP, OpVT)) {
15639 // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
15640 if (DAG.SignBitIsZero(N0))
15641 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
// Unsigned conversion of an i1/boolean setcc is 1.0 when true, 0.0 otherwise.
15644 // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
15645 if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
15646 (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
15648 return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
15649 DAG.getConstantFP(0.0, DL, VT));
15652 if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
15658 // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x
// The FP middle-man can be removed whenever the integer range that survives
// the first conversion is exactly representable in the FP type.
15659 static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
15660 SDValue N0 = N->getOperand(0);
15661 EVT VT = N->getValueType(0);
15663 if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
15666 SDValue Src = N0.getOperand(0);
15667 EVT SrcVT = Src.getValueType();
15668 bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
15669 bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
15671 // We can safely assume the conversion won't overflow the output range,
15672 // because (for example) (uint8_t)18293.f is undefined behavior.
15674 // Since we can assume the conversion won't overflow, our decision as to
15675 // whether the input will fit in the float should depend on the minimum
15676 // of the input range and output range.
15678 // This means this is also safe for a signed input and unsigned output, since
15679 // a negative input would lead to undefined behavior.
// A signed input loses one magnitude bit to the sign, hence the subtraction.
15680 unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
15681 unsigned OutputSize = (int)VT.getScalarSizeInBits();
15682 unsigned ActualSize = std::min(InputSize, OutputSize);
15683 const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
15685 // We can only fold away the float conversion if the input range can be
15686 // represented exactly in the float range.
// The FP type's significand precision must cover every integer up to
// ActualSize bits; otherwise the round trip is lossy and we keep the node.
15687 if (APFloat::semanticsPrecision(sem) >= ActualSize) {
15688 if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
15689 unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
15690 : ISD::ZERO_EXTEND;
15691 return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
15693 if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
15694 return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
// Same width: a bitcast suffices (handles e.g. signedness-only changes).
15695 return DAG.getBitcast(VT, Src);
// Combine FP_TO_SINT: propagate undef, constant-fold, then try to remove an
// int->FP->int round trip via FoldIntToFPToInt.
15700 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
15701 SDValue N0 = N->getOperand(0);
15702 EVT VT = N->getValueType(0);
15704 // fold (fp_to_sint undef) -> undef
15706 return DAG.getUNDEF(VT);
15708 // fold (fp_to_sint c1fp) -> c1
15709 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15710 return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
15712 return FoldIntToFPToInt(N, DAG);
// Combine FP_TO_UINT: unsigned counterpart of visitFP_TO_SINT — propagate
// undef, constant-fold, then try FoldIntToFPToInt.
15715 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
15716 SDValue N0 = N->getOperand(0);
15717 EVT VT = N->getValueType(0);
15719 // fold (fp_to_uint undef) -> undef
15721 return DAG.getUNDEF(VT);
15723 // fold (fp_to_uint c1fp) -> c1
15724 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15725 return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
15727 return FoldIntToFPToInt(N, DAG);
// Combine FP_ROUND: cancel against FP_EXTEND, collapse double rounding when
// safe, and sink the round through FCOPYSIGN. Operand 1 is the standard
// FP_ROUND "truncating" flag (1 = value-preserving truncation).
15730 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
15731 SDValue N0 = N->getOperand(0);
15732 SDValue N1 = N->getOperand(1);
15733 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
15734 EVT VT = N->getValueType(0);
15736 // fold (fp_round c1fp) -> c1fp
15738 return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);
15740 // fold (fp_round (fp_extend x)) -> x
15741 if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
15742 return N0.getOperand(0);
15744 // fold (fp_round (fp_round x)) -> (fp_round x)
15745 if (N0.getOpcode() == ISD::FP_ROUND) {
15746 const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
15747 const bool N0IsTrunc = N0.getConstantOperandVal(1) == 1;
15749 // Skip this folding if it results in an fp_round from f80 to f16.
15751 // f80 to f16 always generates an expensive (and as yet, unimplemented)
15752 // libcall to __truncxfhf2 instead of selecting native f16 conversion
15753 // instructions from f32 or f64. Moreover, the first (value-preserving)
15754 // fp_round from f80 to either f32 or f64 may become a NOP in platforms like
15756 if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
15759 // If the first fp_round isn't a value preserving truncation, it might
15760 // introduce a tie in the second fp_round, that wouldn't occur in the
15761 // single-step fp_round we want to fold to.
15762 // In other words, double rounding isn't the same as rounding.
15763 // Also, this is a value preserving truncation iff both fp_round's are.
15764 if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
// The merged node is "truncating" only when both original rounds were.
15766 return DAG.getNode(
15767 ISD::FP_ROUND, DL, VT, N0.getOperand(0),
15768 DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL, /*isTarget=*/true));
// Rounding only the magnitude operand is fine: the sign comes from Y.
15772 // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
15773 if (N0.getOpcode() == ISD::FCOPYSIGN && N0->hasOneUse()) {
15774 SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
15775 N0.getOperand(0), N1);
15776 AddToWorklist(Tmp.getNode());
15777 return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
15778 Tmp, N0.getOperand(1));
15781 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
// Combine FP_EXTEND: constant-fold, cancel against value-preserving
// FP_ROUND, merge with FP16_TO_FP, and widen FP loads into extending loads.
15787 SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
15788 SDValue N0 = N->getOperand(0);
15789 EVT VT = N->getValueType(0);
15791 // If this is fp_round(fpextend), don't fold it, allow ourselves to be folded.
// Deferring here lets the FP_ROUND combine collapse the pair instead.
15792 if (N->hasOneUse() &&
15793 N->use_begin()->getOpcode() == ISD::FP_ROUND)
15796 // fold (fp_extend c1fp) -> c1fp
15797 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15798 return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
15800 // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
15801 if (N0.getOpcode() == ISD::FP16_TO_FP &&
15802 TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
15803 return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
15805 // Turn fp_extend(fp_round(X, 1)) -> x since the fp_round doesn't affect the
// A trunc-flag of 1 marks the inner round value-preserving, so X can be
// reused directly, re-rounded to a narrower VT, or re-extended as needed.
15807 if (N0.getOpcode() == ISD::FP_ROUND
15808 && N0.getConstantOperandVal(1) == 1) {
15809 SDValue In = N0.getOperand(0);
15810 if (In.getValueType() == VT) return In;
15811 if (VT.bitsLT(In.getValueType()))
15812 return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
15813 In, N0.getOperand(1));
15814 return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
15817 // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
// Replace the narrow load + extend with a single extending load; the old
// load's value users are redirected to a round of the extended value, and
// its chain users to the new load's chain.
15818 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
15819 TLI.isLoadExtLegalOrCustom(ISD::EXTLOAD, VT, N0.getValueType())) {
15820 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15821 SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
15823 LN0->getBasePtr(), N0.getValueType(),
15824 LN0->getMemOperand());
15825 CombineTo(N, ExtLoad);
15828 DAG.getNode(ISD::FP_ROUND, SDLoc(N0), N0.getValueType(), ExtLoad,
15829 DAG.getIntPtrConstant(1, SDLoc(N0), /*isTarget=*/true)),
15830 ExtLoad.getValue(1));
15831 return SDValue(N, 0); // Return N so it doesn't get rechecked!
15834 if (SDValue NewVSel = matchVSelectOpSizesWithSetCC(N))
// Combine FCEIL: constant-fold only.
15840 SDValue DAGCombiner::visitFCEIL(SDNode *N) {
15841 SDValue N0 = N->getOperand(0);
15842 EVT VT = N->getValueType(0);
15844 // fold (fceil c1) -> fceil(c1)
15845 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15846 return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
// Combine FTRUNC: constant-fold, and drop the truncation when the operand
// is already a rounded-to-integer value (e.g. FNEARBYINT and related ops).
15851 SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
15852 SDValue N0 = N->getOperand(0);
15853 EVT VT = N->getValueType(0);
15855 // fold (ftrunc c1) -> ftrunc(c1)
15856 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15857 return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
15859 // fold ftrunc (known rounded int x) -> x
15860 // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is
15861 // likely to be generated to extract integer from a rounded floating value.
15862 switch (N0.getOpcode()) {
15866 case ISD::FNEARBYINT:
// Combine FFLOOR: constant-fold only.
15875 SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
15876 SDValue N0 = N->getOperand(0);
15877 EVT VT = N->getValueType(0);
15879 // fold (ffloor c1) -> ffloor(c1)
15880 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15881 return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
// Combine FNEG: constant-fold, push the negation into the operand via
// TLI.getNegatedExpression, and fold -(X-Y) -> (Y-X) when signed zeros can
// be ignored.
15886 SDValue DAGCombiner::visitFNEG(SDNode *N) {
15887 SDValue N0 = N->getOperand(0);
15888 EVT VT = N->getValueType(0);
15889 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15891 // Constant fold FNEG.
15892 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15893 return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
// Let the target fold the negation into the expression when it is free or
// cheaper than an explicit FNEG.
15895 if (SDValue NegN0 =
15896 TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
15899 // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
15900 // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
15901 // know it was called from a context with a nsz flag if the input fsub does
// Requires the global no-signed-zeros option or the node-level nsz flag,
// and a single-use FSUB so no other user observes the original value.
15903 if (N0.getOpcode() == ISD::FSUB &&
15904 (DAG.getTarget().Options.NoSignedZerosFPMath ||
15905 N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
15906 return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
15910 if (SDValue Cast = foldSignChangeInBitcast(N))
// Combine the four FP min/max opcodes (FMINNUM/FMAXNUM, which ignore NaNs,
// and FMINIMUM/FMAXIMUM, which propagate them): constant-fold, canonicalize
// the constant to the RHS, and simplify against NaN and +/-inf constants.
15916 SDValue DAGCombiner::visitFMinMax(SDNode *N) {
15917 SDValue N0 = N->getOperand(0);
15918 SDValue N1 = N->getOperand(1);
15919 EVT VT = N->getValueType(0);
15920 const SDNodeFlags Flags = N->getFlags();
15921 unsigned Opc = N->getOpcode();
// PropagatesNaN distinguishes the IEEE-754 minimum/maximum family from the
// minnum/maxnum family; IsMin selects the "smaller wins" direction.
15922 bool PropagatesNaN = Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM;
15923 bool IsMin = Opc == ISD::FMINNUM || Opc == ISD::FMINIMUM;
15924 SelectionDAG::FlagInserter FlagsInserter(DAG, N);
15927 if (SDValue C = DAG.FoldConstantArithmetic(Opc, SDLoc(N), VT, {N0, N1}))
15930 // Canonicalize to constant on RHS.
15931 if (DAG.isConstantFPBuildVectorOrConstantFP(N0) &&
15932 !DAG.isConstantFPBuildVectorOrConstantFP(N1))
15933 return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
15935 if (const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1)) {
15936 const APFloat &AF = N1CFP->getValueAPF();
15938 // minnum(X, nan) -> X
15939 // maxnum(X, nan) -> X
15940 // minimum(X, nan) -> nan
15941 // maximum(X, nan) -> nan
15943 return PropagatesNaN ? N->getOperand(1) : N->getOperand(0);
15945 // In the following folds, inf can be replaced with the largest finite
15946 // float, if the ninf flag is set.
15947 if (AF.isInfinity() || (Flags.hasNoInfs() && AF.isLargest())) {
// An infinity on the "winning" side dominates the result; the nnan
// requirement for the propagating variants guards against X being NaN.
15948 // minnum(X, -inf) -> -inf
15949 // maxnum(X, +inf) -> +inf
15950 // minimum(X, -inf) -> -inf if nnan
15951 // maximum(X, +inf) -> +inf if nnan
15952 if (IsMin == AF.isNegative() && (!PropagatesNaN || Flags.hasNoNaNs()))
15953 return N->getOperand(1);
15955 // minnum(X, +inf) -> X if nnan
15956 // maxnum(X, -inf) -> X if nnan
15957 // minimum(X, +inf) -> X
15958 // maximum(X, -inf) -> X
15959 if (IsMin != AF.isNegative() && (PropagatesNaN || Flags.hasNoNaNs()))
15960 return N->getOperand(0);
// Combine FABS: constant-fold, collapse nested fabs, and strip sign-only
// operations (fneg/fcopysign) whose effect fabs overwrites anyway.
15967 SDValue DAGCombiner::visitFABS(SDNode *N) {
15968 SDValue N0 = N->getOperand(0);
15969 EVT VT = N->getValueType(0);
15971 // fold (fabs c1) -> fabs(c1)
15972 if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
15973 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
15975 // fold (fabs (fabs x)) -> (fabs x)
15976 if (N0.getOpcode() == ISD::FABS)
15977 return N->getOperand(0);
15979 // fold (fabs (fneg x)) -> (fabs x)
15980 // fold (fabs (fcopysign x, y)) -> (fabs x)
15981 if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
15982 return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
15984 if (SDValue Cast = foldSignChangeInBitcast(N))
// Combine BRCOND(chain, cond, dest): look through FREEZE on the condition,
// fold a SETCC condition into BR_CC where legal, and otherwise try to
// rebuild a SETCC from srl/xor patterns via rebuildSetCC.
15990 SDValue DAGCombiner::visitBRCOND(SDNode *N) {
15991 SDValue Chain = N->getOperand(0);
15992 SDValue N1 = N->getOperand(1);
15993 SDValue N2 = N->getOperand(2);
15995 // BRCOND(FREEZE(cond)) is equivalent to BRCOND(cond) (both are
15996 // nondeterministic jumps).
15997 if (N1->getOpcode() == ISD::FREEZE && N1.hasOneUse()) {
15998 return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain,
15999 N1->getOperand(0), N2);
16002 // If N is a constant we could fold this into a fallthrough or unconditional
16003 // branch. However that doesn't happen very often in normal code, because
16004 // Instcombine/SimplifyCFG should have handled the available opportunities.
16005 // If we did this folding here, it would be necessary to update the
16006 // MachineBasicBlock CFG, which is awkward.
16008 // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
// BR_CC operand order: Chain, CondCode, LHS, RHS, DestBB — the setcc's
// condition code is operand 2 of the SETCC node.
16010 if (N1.getOpcode() == ISD::SETCC &&
16011 TLI.isOperationLegalOrCustom(ISD::BR_CC,
16012 N1.getOperand(0).getValueType())) {
16013 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
16014 Chain, N1.getOperand(2),
16015 N1.getOperand(0), N1.getOperand(1), N2);
16018 if (N1.hasOneUse()) {
16019 // rebuildSetCC calls visitXor which may change the Chain when there is a
16020 // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
16021 HandleSDNode ChainHandle(Chain);
16022 if (SDValue NewN1 = rebuildSetCC(N1))
16023 return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
16024 ChainHandle.getValue(), NewN1, N2);
// Try to rewrite a non-SETCC boolean value N (as used by BRCOND) into an
// explicit SETCC. Handles two shapes: (srl (and X, pow2), log2(pow2)) ->
// setcc ne, and xor-based comparisons -> setcc eq/ne. Returns a null
// SDValue when no rewrite applies.
16030 SDValue DAGCombiner::rebuildSetCC(SDValue N) {
16031 if (N.getOpcode() == ISD::SRL ||
16032 (N.getOpcode() == ISD::TRUNCATE &&
16033 (N.getOperand(0).hasOneUse() &&
16034 N.getOperand(0).getOpcode() == ISD::SRL))) {
16035 // Look pass the truncate.
16036 if (N.getOpcode() == ISD::TRUNCATE)
16037 N = N.getOperand(0);
16039 // Match this pattern so that we can generate simpler code:
16042 // %b = and i32 %a, 2
16043 // %c = srl i32 %b, 1
16044 // brcond i32 %c ...
16049 // %b = and i32 %a, 2
16050 // %c = setcc eq %b, 0
16053 // This applies only when the AND constant value has one bit set and the
16054 // SRL constant is equal to the log2 of the AND constant. The back-end is
16055 // smart enough to convert the result into a TEST/JMP sequence.
16056 SDValue Op0 = N.getOperand(0);
16057 SDValue Op1 = N.getOperand(1);
16059 if (Op0.getOpcode() == ISD::AND && Op1.getOpcode() == ISD::Constant) {
16060 SDValue AndOp1 = Op0.getOperand(1);
16062 if (AndOp1.getOpcode() == ISD::Constant) {
16063 const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
16065 if (AndConst.isPowerOf2() &&
16066 cast<ConstantSDNode>(Op1)->getAPIntValue() == AndConst.logBase2()) {
16068 return DAG.getSetCC(DL, getSetCCResultType(Op0.getValueType()),
16069 Op0, DAG.getConstant(0, DL, Op0.getValueType()),
16076 // Transform (brcond (xor x, y)) -> (brcond (setcc, x, y, ne))
16077 // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc, x, y, eq))
16078 if (N.getOpcode() == ISD::XOR) {
16079 // Because we may call this on a speculatively constructed
16080 // SimplifiedSetCC Node, we need to simplify this node first.
16081 // Ideally this should be folded into SimplifySetCC and not
16082 // here. For now, grab a handle to N so we don't lose it from
16083 // replacements interal to the visit.
16084 HandleSDNode XORHandle(N);
// Iterate: visitXOR may simplify the node (possibly replacing N in-place,
// in which case the handle recovers the live value) until it is no longer
// an XOR or no further simplification is possible.
16085 while (N.getOpcode() == ISD::XOR) {
16086 SDValue Tmp = visitXOR(N.getNode());
16087 // No simplification done.
16088 if (!Tmp.getNode())
16090 // Returning N is form in-visit replacement that may invalidated
16091 // N. Grab value from Handle.
16092 if (Tmp.getNode() == N.getNode())
16093 N = XORHandle.getValue()
16094 else // Node simplified. Try simplifying again.
16098 if (N.getOpcode() != ISD::XOR)
16101 SDValue Op0 = N->getOperand(0);
16102 SDValue Op1 = N->getOperand(1);
// Only rebuild when neither side is already a SETCC; a bitwise-not wrapper
// (xor ..., -1) flips the comparison to equality.
16104 if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
16105 bool Equal = false;
16106 // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
16107 if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
16108 Op0.getValueType() == MVT::i1) {
16110 Op0 = N->getOperand(0);
16111 Op1 = N->getOperand(1);
16115 EVT SetCCVT = N.getValueType();
16117 SetCCVT = getSetCCResultType(SetCCVT);
16118 // Replace the uses of XOR with SETCC
16119 return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
16120 Equal ? ISD::SETEQ : ISD::SETNE);
16127 // Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
// Combine BR_CC by running SimplifySetCC on its comparison; if that yields
// a (possibly simpler) SETCC, rebuild the BR_CC around it.
16129 SDValue DAGCombiner::visitBR_CC(SDNode *N) {
16130 CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
16131 SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
16133 // If N is a constant we could fold this into a fallthrough or unconditional
16134 // branch. However that doesn't happen very often in normal code, because
16135 // Instcombine/SimplifyCFG should have handled the available opportunities.
16136 // If we did this folding here, it would be necessary to update the
16137 // MachineBasicBlock CFG, which is awkward.
16139 // Use SimplifySetCC to simplify SETCC's.
16140 SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
16141 CondLHS, CondRHS, CC->get(), SDLoc(N),
16143 if (Simp.getNode()) AddToWorklist(Simp.getNode());
16145 // fold to a simpler setcc
16146 if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
16147 return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
16148 N->getOperand(0), Simp.getOperand(2),
16149 Simp.getOperand(0), Simp.getOperand(1),
// Classify N as one of the four indexable memory ops (load, store, masked
// load, masked store), check it is not already indexed and that the target
// supports the Inc or Dec indexed form for its memory VT, and extract the
// base pointer. Outputs: IsLoad, IsMasked, Ptr. Returns false if N cannot
// be turned into an indexed access.
16155 static bool getCombineLoadStoreParts(SDNode *N, unsigned Inc, unsigned Dec,
16156 bool &IsLoad, bool &IsMasked, SDValue &Ptr,
16157 const TargetLowering &TLI) {
16158 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
16159 if (LD->isIndexed())
16161 EVT VT = LD->getMemoryVT();
16162 if (!TLI.isIndexedLoadLegal(Inc, VT) && !TLI.isIndexedLoadLegal(Dec, VT))
16164 Ptr = LD->getBasePtr();
16165 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
16166 if (ST->isIndexed())
16168 EVT VT = ST->getMemoryVT();
16169 if (!TLI.isIndexedStoreLegal(Inc, VT) && !TLI.isIndexedStoreLegal(Dec, VT))
16171 Ptr = ST->getBasePtr();
16173 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
16174 if (LD->isIndexed())
16176 EVT VT = LD->getMemoryVT();
16177 if (!TLI.isIndexedMaskedLoadLegal(Inc, VT) &&
16178 !TLI.isIndexedMaskedLoadLegal(Dec, VT))
16180 Ptr = LD->getBasePtr();
16182 } else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
16183 if (ST->isIndexed())
16185 EVT VT = ST->getMemoryVT();
16186 if (!TLI.isIndexedMaskedStoreLegal(Inc, VT) &&
16187 !TLI.isIndexedMaskedStoreLegal(Dec, VT))
16189 Ptr = ST->getBasePtr();
16198 /// Try turning a load/store into a pre-indexed load/store when the base
16199 /// pointer is an add or subtract and it has other uses besides the load/store.
16200 /// After the transformation, the new indexed load/store has effectively folded
16201 /// the add/subtract in and all of its other uses are redirected to the
16202 /// new load/store.
16203 bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
16204 if (Level < AfterLegalizeDAG)
16207 bool IsLoad = true;
16208 bool IsMasked = false;
16210 if (!getCombineLoadStoreParts(N, ISD::PRE_INC, ISD::PRE_DEC, IsLoad, IsMasked,
16214 // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
16215 // out. There is no reason to make this a preinc/predec.
16216 if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
16220 // Ask the target to do addressing mode selection.
16223 ISD::MemIndexedMode AM = ISD::UNINDEXED;
16224 if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
16227 // Backends without true r+i pre-indexed forms may need to pass a
16228 // constant base with a variable offset so that constant coercion
16229 // will work with the patterns in canonical form.
16230 bool Swapped = false;
16231 if (isa<ConstantSDNode>(BasePtr)) {
16232 std::swap(BasePtr, Offset);
16236 // Don't create a indexed load / store with zero offset.
16237 if (isNullConstant(Offset))
16240 // Try turning it into a pre-indexed load / store except when:
16241 // 1) The new base ptr is a frame index.
16242 // 2) If N is a store and the new base ptr is either the same as or is a
16243 // predecessor of the value being stored.
16244 // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
16245 // that would create a cycle.
16246 // 4) All uses are load / store ops that use it as old base ptr.
16248 // Check #1. Preinc'ing a frame index would require copying the stack pointer
16249 // (plus the implicit offset) to a register to preinc anyway.
16250 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
16255 SDValue Val = IsMasked ? cast<MaskedStoreSDNode>(N)->getValue()
16256 : cast<StoreSDNode>(N)->getValue();
16258 // Would require a copy.
16259 if (Val == BasePtr)
16262 // Would create a cycle.
16263 if (Val == Ptr || Ptr->isPredecessorOf(Val.getNode()))
16267 // Caches for hasPredecessorHelper.
16268 SmallPtrSet<const SDNode *, 32> Visited;
16269 SmallVector<const SDNode *, 16> Worklist;
16270 Worklist.push_back(N);
16272 // If the offset is a constant, there may be other adds of constants that
16273 // can be folded with this one. We should do this to avoid having to keep
16274 // a copy of the original base pointer.
16275 SmallVector<SDNode *, 16> OtherUses;
16276 if (isa<ConstantSDNode>(Offset))
16277 for (SDNode::use_iterator UI = BasePtr->use_begin(),
16278 UE = BasePtr->use_end();
16280 SDUse &Use = UI.getUse();
16281 // Skip the use that is Ptr and uses of other results from BasePtr's
16282 // node (important for nodes that return multiple results).
16283 if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
16286 if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
16289 if (Use.getUser()->getOpcode() != ISD::ADD &&
16290 Use.getUser()->getOpcode() != ISD::SUB) {
16295 SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
16296 if (!isa<ConstantSDNode>(Op1)) {
16301 // FIXME: In some cases, we can be smarter about this.
16302 if (Op1.getValueType() != Offset.getValueType()) {
16307 OtherUses.push_back(Use.getUser());
16311 std::swap(BasePtr, Offset);
16313 // Now check for #3 and #4.
16314 bool RealUse = false;
16316 for (SDNode *Use : Ptr->uses()) {
16319 if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
16322 // If Ptr may be folded in addressing mode of other use, then it's
16323 // not profitable to do this transformation.
16324 if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
16334 Result = DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
16337 DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM);
16340 Result = DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
16343 Result = DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr,
16348 LLVM_DEBUG(dbgs() << "\nReplacing.4 "; N->dump(&DAG); dbgs() << "\nWith: ";
16349 Result.dump(&DAG); dbgs() << '\n');
16350 WorklistRemover DeadNodes(*this);
16352 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
16353 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
16355 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
16358 // Finally, since the node is now dead, remove it from the graph.
16359 deleteAndRecombine(N);
16362 std::swap(BasePtr, Offset);
16364 // Replace other uses of BasePtr that can be updated to use Ptr
16365 for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
16366 unsigned OffsetIdx = 1;
16367 if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
16369 assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
16370 BasePtr.getNode() && "Expected BasePtr operand");
16372 // We need to replace ptr0 in the following expression:
16373 // x0 * offset0 + y0 * ptr0 = t0
16375 // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
16377 // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
16378 // indexed load/store and the expression that needs to be re-written.
16380 // Therefore, we have:
16381 // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
16383 auto *CN = cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
16384 const APInt &Offset0 = CN->getAPIntValue();
16385 const APInt &Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
16386 int X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
16387 int Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
16388 int X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
16389 int Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
16391 unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
16393 APInt CNV = Offset0;
16394 if (X0 < 0) CNV = -CNV;
16395 if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
16396 else CNV = CNV - Offset1;
16398 SDLoc DL(OtherUses[i]);
16400 // We can now generate the new expression.
16401 SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
16402 SDValue NewOp2 = Result.getValue(IsLoad ? 1 : 0);
16404 SDValue NewUse = DAG.getNode(Opcode,
16406 OtherUses[i]->getValueType(0), NewOp1, NewOp2);
16407 DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
16408 deleteAndRecombine(OtherUses[i]);
16411 // Replace the uses of Ptr with uses of the updated base value.
16412 DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(IsLoad ? 1 : 0));
16413 deleteAndRecombine(Ptr.getNode());
16414 AddToWorklist(Result.getNode());
// Decide whether the pointer-update node \p PtrUse (an ADD/SUB user of
// \p Ptr) can be folded into the memory op \p N as a post-indexed
// load/store. On success the target hook fills in \p BasePtr, \p Offset
// and \p AM. Returns false when the fold would be illegal or unprofitable.
// NOTE(review): the extraction dropped several lines here (early returns,
// continues and closing braces); the surviving code is kept verbatim.
16419 static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
16420 SDValue &BasePtr, SDValue &Offset,
16421 ISD::MemIndexedMode &AM,
16423 const TargetLowering &TLI) {
// Only plain ADD/SUB pointer updates are candidates for post-indexing.
16425 (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
// Ask the target to split PtrUse into base + offset for a post-indexed form.
16428 if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
16431 // Don't create an indexed load / store with zero offset.
16432 if (isNullConstant(Offset))
// Frame indices and physical registers cannot be replaced by the pointer
// result of an indexed memory operation.
16435 if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
16438 SmallPtrSet<const SDNode *, 32> Visited;
// Inspect the other users of the base pointer for reasons to bail out.
16439 for (SDNode *Use : BasePtr->uses()) {
16440 if (Use == Ptr.getNode())
16443 // No if there's a later user which could perform the index instead.
16444 if (isa<MemSDNode>(Use)) {
16445 bool IsLoad = true;
16446 bool IsMasked = false;
// If this other memory op could itself become post-indexed and N is
// reachable from it, prefer letting that later op perform the increment.
16448 if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
16449 IsMasked, OtherPtr, TLI)) {
16450 SmallVector<const SDNode *, 2> Worklist;
16451 Worklist.push_back(Use);
16452 if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
16457 // If all the uses are load / store addresses, then don't do the
16459 if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
16460 for (SDNode *UseUse : Use->uses())
// The update could be folded into this user's addressing mode instead,
// making the post-indexed transform unprofitable.
16461 if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
// Find an ADD/SUB user of \p N's pointer operand that can be folded into N
// as a post-indexed access. Returns that user node (with BasePtr/Offset/AM
// filled in by shouldCombineToPostInc), or presumably null on failure —
// NOTE(review): the failure-path lines were dropped by the extraction.
16468 static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
16469 bool &IsMasked, SDValue &Ptr,
16470 SDValue &BasePtr, SDValue &Offset,
16471 ISD::MemIndexedMode &AM,
16473 const TargetLowering &TLI) {
// N must itself be a (masked) load/store for which POST_INC/POST_DEC is
// a legal indexed mode.
16474 if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
16475 IsMasked, Ptr, TLI) ||
16479 // Try turning it into a post-indexed load / store except when
16480 // 1) All uses are load / store ops that use it as base ptr (and
16481 // it may be folded as addressing mode).
16482 // 2) Op must be independent of N, i.e. Op is neither a predecessor
16483 // nor a successor of N. Otherwise, if Op is folded that would
16485 for (SDNode *Op : Ptr->uses()) {
16487 if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
// Check independence: neither N nor Op may be a predecessor of the other,
// otherwise folding Op into N would create a cycle in the DAG.
16491 SmallPtrSet<const SDNode *, 32> Visited;
16492 SmallVector<const SDNode *, 8> Worklist;
16493 // Ptr is predecessor to both N and Op.
16494 Visited.insert(Ptr.getNode());
16495 Worklist.push_back(N);
16496 Worklist.push_back(Op);
16497 if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
16498 !SDNode::hasPredecessorHelper(Op, Visited, Worklist))
16504 /// Try to combine a load/store with a add/sub of the base pointer node into a
16505 /// post-indexed load/store. The transformation folded the add/subtract into the
16506 /// new indexed load/store effectively and all of its uses are redirected to the
16507 /// new load/store.
16508 bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
// Post-indexed forms are only created after full DAG legalization.
16509 if (Level < AfterLegalizeDAG)
16512 bool IsLoad = true;
16513 bool IsMasked = false;
16517 ISD::MemIndexedMode AM = ISD::UNINDEXED;
// Locate a foldable pointer-update node; fills BasePtr/Offset/AM.
16518 SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
16519 Offset, AM, DAG, TLI);
// Build the replacement indexed node matching N's masked/unmasked and
// load/store flavor.
16525 Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
16527 : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
16528 BasePtr, Offset, AM);
16530 Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
16531 BasePtr, Offset, AM)
16532 : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
16533 BasePtr, Offset, AM);
16534 ++PostIndexedNodes;
16536 LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); dbgs() << "\nWith: ";
16537 Result.dump(&DAG); dbgs() << '\n');
16538 WorklistRemover DeadNodes(*this);
// For a load: value -> result 0, chain -> result 2 of the indexed node.
// For a store: chain -> result 1.
16540 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
16541 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
16543 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
16546 // Finally, since the node is now dead, remove it from the graph.
16547 deleteAndRecombine(N);
16549 // Replace the uses of Use with uses of the updated base value.
16550 DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
16551 Result.getValue(IsLoad ? 1 : 0));
16552 deleteAndRecombine(Op);
16556 /// Return the base-pointer arithmetic from an indexed \p LD.
16557 SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
16558 ISD::MemIndexedMode AM = LD->getAddressingMode();
16559 assert(AM != ISD::UNINDEXED);
// Operand 1 is the base pointer, operand 2 the increment of an indexed load.
16560 SDValue BP = LD->getOperand(1);
16561 SDValue Inc = LD->getOperand(2);
16563 // Some backends use TargetConstants for load offsets, but don't expect
16564 // TargetConstants in general ADD nodes. We can convert these constants into
16565 // regular Constants (if the constant is not opaque).
16566 assert((Inc.getOpcode() != ISD::TargetConstant ||
16567 !cast<ConstantSDNode>(Inc)->isOpaque()) &&
16568 "Cannot split out indexing using opaque target constants");
16569 if (Inc.getOpcode() == ISD::TargetConstant) {
16570 ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
16571 Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
16572 ConstInc->getValueType(0));
// INC modes become an ADD of base and increment, DEC modes a SUB.
16576 (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
16577 return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
// Return the element count of \p T if it is a vector type, otherwise a
// fixed count of zero (so scalar types compare equal to each other).
16580 static inline ElementCount numVectorEltsOrZero(EVT T) {
16581 return T.isVector() ? T.getVectorElementCount() : ElementCount::getFixed(0);
// Try to produce in \p Val the value of \p ST as it exists in memory, i.e.
// narrowed to ST's memory type. Returns true on success. Used when
// forwarding a stored value directly to a load.
16584 bool DAGCombiner::getTruncatedStoreValue(StoreSDNode *ST, SDValue &Val) {
16585 Val = ST->getValue();
16586 EVT STType = Val.getValueType();
16587 EVT STMemType = ST->getMemoryVT();
// Non-truncating store: the value is already in memory form.
16588 if (STType == STMemType)
// NOTE(review): bailing out when the memory type is legal looks inverted at
// first glance; lines around here were dropped by the extraction — confirm
// against upstream before relying on this reading.
16590 if (isTypeLegal(STMemType))
16591 return false; // fail.
// Floating-point truncating store: model the narrowing with FTRUNC if legal.
16592 if (STType.isFloatingPoint() && STMemType.isFloatingPoint() &&
16593 TLI.isOperationLegal(ISD::FTRUNC, STMemType)) {
16594 Val = DAG.getNode(ISD::FTRUNC, SDLoc(ST), STMemType, Val);
// Integer truncating store (same element count for vectors): use TRUNCATE.
16597 if (numVectorEltsOrZero(STType) == numVectorEltsOrZero(STMemType) &&
16598 STType.isInteger() && STMemType.isInteger()) {
16599 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(ST), STMemType, Val);
// Same bit width: a bitcast suffices.
16602 if (STType.getSizeInBits() == STMemType.getSizeInBits()) {
16603 Val = DAG.getBitcast(STMemType, Val);
16606 return false; // fail.
// Extend \p Val (which has \p LD's memory type) to LD's result type,
// honoring LD's extension kind (any/sign/zero-extend, or bitcast for a
// non-extending load). Returns true if the extension could be modeled.
16609 bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
16610 EVT LDMemType = LD->getMemoryVT();
16611 EVT LDType = LD->getValueType(0);
16612 assert(Val.getValueType() == LDMemType &&
16613 "Attempting to extend value of non-matching type");
// Nothing to do when the load is not widening.
16614 if (LDType == LDMemType)
16616 if (LDMemType.isInteger() && LDType.isInteger()) {
// Map the load's extension kind to the matching extend node.
16617 switch (LD->getExtensionType()) {
16618 case ISD::NON_EXTLOAD:
16619 Val = DAG.getBitcast(LDType, Val);
16622 Val = DAG.getNode(ISD::ANY_EXTEND, SDLoc(LD), LDType, Val);
16624 case ISD::SEXTLOAD:
16625 Val = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(LD), LDType, Val);
16627 case ISD::ZEXTLOAD:
16628 Val = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(LD), LDType, Val);
// If \p LD loads exactly (a part of) the value just written by the store it
// is chained to, replace the load with the stored value (suitably
// truncated/extended/masked), eliminating the round trip through memory.
// Returns the replacement value or an empty SDValue.
16635 SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
16636 if (OptLevel == CodeGenOpt::None || !LD->isSimple())
// The store must be the immediate chain predecessor of the load.
16638 SDValue Chain = LD->getOperand(0);
16639 StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
16640 // TODO: Relax this restriction for unordered atomics (see D66309)
16641 if (!ST || !ST->isSimple())
16644 EVT LDType = LD->getValueType(0);
16645 EVT LDMemType = LD->getMemoryVT();
16646 EVT STMemType = ST->getMemoryVT();
16647 EVT STType = ST->getValue().getValueType();
16649 // There are two cases to consider here:
16650 //  1. The store is fixed width and the load is scalable. In this case we
16651 //     don't know at compile time if the store completely envelops the load
16652 //     so we abandon the optimisation.
16653 //  2. The store is scalable and the load is fixed width. We could
16654 //     potentially support a limited number of cases here, but there has been
16655 //     no cost-benefit analysis to prove it's worth it.
16656 bool LdStScalable = LDMemType.isScalableVector();
16657 if (LdStScalable != STMemType.isScalableVector())
16660 // If we are dealing with scalable vectors on a big endian platform the
16661 // calculation of offsets below becomes trickier, since we do not know at
16662 // compile time the absolute size of the vector. Until we've done more
16663 // analysis on big-endian platforms it seems better to bail out for now.
16664 if (LdStScalable && DAG.getDataLayout().isBigEndian())
// Both accesses must provably address the same base, with a known byte
// offset between them.
16667 BaseIndexOffset BasePtrLD = BaseIndexOffset::match(LD, DAG);
16668 BaseIndexOffset BasePtrST = BaseIndexOffset::match(ST, DAG);
16670 if (!BasePtrST.equalBaseIndex(BasePtrLD, DAG, Offset))
16673 // Normalize for Endianness. After this Offset=0 will denote that the least
16674 // significant bit in the loaded value maps to the least significant bit in
16675 // the stored value). With Offset=n (for n > 0) the loaded value starts at the
16676 // n:th least significant byte of the stored value.
16677 if (DAG.getDataLayout().isBigEndian())
16678 Offset = ((int64_t)STMemType.getStoreSizeInBits().getFixedSize() -
16679 (int64_t)LDMemType.getStoreSizeInBits().getFixedSize()) /
16683 // Check that the stored value cover all bits that are loaded.
16686 TypeSize LdMemSize = LDMemType.getSizeInBits();
16687 TypeSize StMemSize = STMemType.getSizeInBits();
// Scalable sizes can only be compared for exact equality at offset zero.
16689 STCoversLD = (Offset == 0) && LdMemSize == StMemSize;
16691 STCoversLD = (Offset >= 0) && (Offset * 8 + LdMemSize.getFixedSize() <=
16692 StMemSize.getFixedSize());
// Helper that performs the replacement, splitting off the index arithmetic
// first when LD is an indexed load.
16694 auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
16695 if (LD->isIndexed()) {
16696 // Cannot handle opaque target constants and we must respect the user's
16697 // request not to split indexes from loads.
16698 if (!canSplitIdx(LD))
16700 SDValue Idx = SplitIndexingFromLoad(LD);
16701 SDValue Ops[] = {Val, Idx, Chain};
16702 return CombineTo(LD, Ops, 3);
16704 return CombineTo(LD, Val, Chain);
16710 // Memory as copy space (potentially masked).
16711 if (Offset == 0 && LDType == STType && STMemType == LDMemType) {
16712 // Simple case: Direct non-truncating forwarding
16713 if (LDType.getSizeInBits() == LdMemSize)
16714 return ReplaceLd(LD, ST->getValue(), Chain);
16715 // Can we model the truncate and extension with an and mask?
16716 if (STType.isInteger() && LDMemType.isInteger() && !STType.isVector() &&
16717 !LDMemType.isVector() && LD->getExtensionType() != ISD::SEXTLOAD) {
16718 // Mask to size of LDMemType
16720 DAG.getConstant(APInt::getLowBitsSet(STType.getFixedSizeInBits(),
16721 StMemSize.getFixedSize()),
16722 SDLoc(ST), STType);
16723 auto Val = DAG.getNode(ISD::AND, SDLoc(LD), LDType, ST->getValue(), Mask);
16724 return ReplaceLd(LD, Val, Chain);
16728 // TODO: Deal with nonzero offset.
16729 if (LD->getBasePtr().isUndef() || Offset != 0)
16731 // Model necessary truncations / extenstions.
16733 // Truncate Value To Stored Memory Size.
16735 if (!getTruncatedStoreValue(ST, Val))
16737 if (!isTypeLegal(LDMemType))
16739 if (STMemType != LDMemType) {
16740 // TODO: Support vectors? This requires extract_subvector/bitcast.
16741 if (!STMemType.isVector() && !LDMemType.isVector() &&
16742 STMemType.isInteger() && LDMemType.isInteger())
16743 Val = DAG.getNode(ISD::TRUNCATE, SDLoc(LD), LDMemType, Val);
// Finally widen the memory-typed value to the load's result type, honoring
// the load's extension kind.
16747 if (!extendLoadedValueToExtension(LD, Val))
16749 return ReplaceLd(LD, Val, Chain);
16752 // On failure, cleanup dead nodes we may have created.
16753 if (Val->use_empty())
16754 deleteAndRecombine(Val.getNode());
// Main combine entry point for LOAD nodes: deletes dead loads, forwards
// stored values, refines alignment, finds better chains, and tries
// pre/post-indexed and load-slicing transforms.
16758 SDValue DAGCombiner::visitLOAD(SDNode *N) {
16759 LoadSDNode *LD = cast<LoadSDNode>(N);
16760 SDValue Chain = LD->getChain();
16761 SDValue Ptr = LD->getBasePtr();
16763 // If load is not volatile and there are no uses of the loaded value (and
16764 // the updated indexed value in case of indexed loads), change uses of the
16765 // chain value into uses of the chain input (i.e. delete the dead load).
16766 // TODO: Allow this for unordered atomics (see D66309)
16767 if (LD->isSimple()) {
16768 if (N->getValueType(1) == MVT::Other) {
16769 // Unindexed loads.
16770 if (!N->hasAnyUseOfValue(0)) {
16771 // It's not safe to use the two value CombineTo variant here. e.g.
16772 // v1, chain2 = load chain1, loc
16773 // v2, chain3 = load chain2, loc
16775 // Now we replace use of chain2 with chain1. This makes the second load
16776 // isomorphic to the one we are deleting, and thus makes this load live.
16777 LLVM_DEBUG(dbgs() << "\nReplacing.6 "; N->dump(&DAG);
16778 dbgs() << "\nWith chain: "; Chain.dump(&DAG);
16780 WorklistRemover DeadNodes(*this);
16781 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
16782 AddUsersToWorklist(Chain.getNode());
16783 if (N->use_empty())
16784 deleteAndRecombine(N);
16786 return SDValue(N, 0); // Return N so it doesn't get rechecked!
// Indexed loads: results are (value, new pointer, chain).
16790 assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
16792 // If this load has an opaque TargetConstant offset, then we cannot split
16793 // the indexing into an add/sub directly (that TargetConstant may not be
16794 // valid for a different type of node, and we cannot convert an opaque
16795 // target constant into a regular constant).
16796 bool CanSplitIdx = canSplitIdx(LD);
// Dead indexed load: the loaded value is unused, and the pointer result is
// either unused or can be rematerialized as explicit add/sub arithmetic.
16798 if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
16799 SDValue Undef = DAG.getUNDEF(N->getValueType(0));
16801 if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
16802 Index = SplitIndexingFromLoad(LD);
16803 // Try to fold the base pointer arithmetic into subsequent loads and
16805 AddUsersToWorklist(N);
16807 Index = DAG.getUNDEF(N->getValueType(1));
16808 LLVM_DEBUG(dbgs() << "\nReplacing.7 "; N->dump(&DAG);
16809 dbgs() << "\nWith: "; Undef.dump(&DAG);
16810 dbgs() << " and 2 other values\n");
16811 WorklistRemover DeadNodes(*this);
16812 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
16813 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
16814 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
16815 deleteAndRecombine(N);
16816 return SDValue(N, 0); // Return N so it doesn't get rechecked!
16821 // If this load is directly stored, replace the load value with the stored
16823 if (auto V = ForwardStoreValueToDirectLoad(LD))
16826 // Try to infer better alignment information than the load already has.
16827 if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
16828 if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
16829 if (*Alignment > LD->getAlign() &&
16830 isAligned(*Alignment, LD->getSrcValueOffset())) {
16831 SDValue NewLoad = DAG.getExtLoad(
16832 LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
16833 LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
16834 LD->getMemOperand()->getFlags(), LD->getAAInfo());
16835 // NewLoad will always be N as we are only refining the alignment
16836 assert(NewLoad.getNode() == N);
16842 if (LD->isUnindexed()) {
16843 // Walk up chain skipping non-aliasing memory nodes.
16844 SDValue BetterChain = FindBetterChain(LD, Chain);
16846 // If there is a better chain.
16847 if (Chain != BetterChain) {
16850 // Replace the chain to void dependency.
16851 if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
16852 ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
16853 BetterChain, Ptr, LD->getMemOperand());
16855 ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
16856 LD->getValueType(0),
16857 BetterChain, Ptr, LD->getMemoryVT(),
16858 LD->getMemOperand());
16861 // Create token factor to keep old chain connected.
16862 SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
16863 MVT::Other, Chain, ReplLoad.getValue(1));
16865 // Replace uses with load result and token factor
16866 return CombineTo(N, ReplLoad.getValue(0), Token);
16870 // Try transforming N to an indexed load.
16871 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
16872 return SDValue(N, 0);
16874 // Try to slice up N to more direct loads if the slices are mapped to
16875 // different register banks or pairing can take place.
16876 if (SliceUpLoad(N))
16877 return SDValue(N, 0);
16884 /// Helper structure used to slice a load in smaller loads.
16885 /// Basically a slice is obtained from the following sequence:
16886 /// Origin = load Ty1, Base
16887 /// Shift = srl Ty1 Origin, CstTy Amount
16888 /// Inst = trunc Shift to Ty2
16890 /// Then, it will be rewritten into:
16891 /// Slice = load SliceTy, Base + SliceOffset
16892 /// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
16894 /// SliceTy is deduced from the number of bits that are actually used to
16896 struct LoadedSlice {
16897 /// Helper structure used to compute the cost of a slice.
16899 /// Are we optimizing for code size.
16900 bool ForCodeSize = false;
// Counters for the operations a slicing configuration would execute.
16903 unsigned Loads = 0;
16904 unsigned Truncates = 0;
16905 unsigned CrossRegisterBanksCopies = 0;
16906 unsigned ZExts = 0;
16907 unsigned Shift = 0;
16909 explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
16911 /// Get the cost of one isolated slice.
16912 Cost(const LoadedSlice &LS, bool ForCodeSize)
16913 : ForCodeSize(ForCodeSize), Loads(1) {
16914 EVT TruncType = LS.Inst->getValueType(0);
16915 EVT LoadedType = LS.getLoadedType();
// A slice whose loaded type differs from the truncated type needs a zext
// unless the target considers that extension free.
16916 if (TruncType != LoadedType &&
16917 !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
16921 /// Account for slicing gain in the current cost.
16922 /// Slicing provide a few gains like removing a shift or a
16923 /// truncate. This method allows to grow the cost of the original
16924 /// load with the gain from this slice.
16925 void addSliceGain(const LoadedSlice &LS) {
16926 // Each slice saves a truncate.
16927 const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
16928 if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
16929 LS.Inst->getValueType(0)))
16931 // If there is a shift amount, this slice gets rid of it.
16934 // If this slice can merge a cross register bank copy, account for it.
16935 if (LS.canMergeExpensiveCrossRegisterBankCopy())
16936 ++CrossRegisterBanksCopies;
// Component-wise accumulation of slice costs.
16939 Cost &operator+=(const Cost &RHS) {
16940 Loads += RHS.Loads;
16941 Truncates += RHS.Truncates;
16942 CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
16943 ZExts += RHS.ZExts;
16944 Shift += RHS.Shift;
16948 bool operator==(const Cost &RHS) const {
16949 return Loads == RHS.Loads && Truncates == RHS.Truncates &&
16950 CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
16951 ZExts == RHS.ZExts && Shift == RHS.Shift;
16954 bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
16956 bool operator<(const Cost &RHS) const {
16957 // Assume cross register banks copies are as expensive as loads.
16958 // FIXME: Do we want some more target hooks?
16959 unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
16960 unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
16961 // Unless we are optimizing for code size, consider the
16962 // expensive operation first.
16963 if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
16964 return ExpensiveOpsLHS < ExpensiveOpsRHS;
16965 return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
16966 (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
16969 bool operator>(const Cost &RHS) const { return RHS < *this; }
16971 bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
16973 bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
16976 // The last instruction that represent the slice. This should be a
16977 // truncate instruction.
16980 // The original load instruction.
16981 LoadSDNode *Origin;
16983 // The right shift amount in bits from the original load.
16986 // The DAG from which Origin came from.
16987 // This is used to get some contextual information about legal types, etc.
16990 LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
16991 unsigned Shift = 0, SelectionDAG *DAG = nullptr)
16992 : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
16994 /// Get the bits used in a chunk of bits \p BitWidth large.
16995 /// \return Result is \p BitWidth and has used bits set to 1 and
16996 /// not used bits set to 0.
16997 APInt getUsedBits() const {
16998 // Reproduce the trunc(lshr) sequence:
16999 // - Start from the truncated value.
17000 // - Zero extend to the desired bit width.
17002 assert(Origin && "No original load to compare against.");
17003 unsigned BitWidth = Origin->getValueSizeInBits(0);
17004 assert(Inst && "This slice is not bound to an instruction");
17005 assert(Inst->getValueSizeInBits(0) <= BitWidth &&
17006 "Extracted slice is bigger than the whole type!");
17007 APInt UsedBits(Inst->getValueSizeInBits(0), 0);
17008 UsedBits.setAllBits();
17009 UsedBits = UsedBits.zext(BitWidth);
// Shift the all-ones mask into the position the slice occupies in the
// original loaded value.
17010 UsedBits <<= Shift;
17014 /// Get the size of the slice to be loaded in bytes.
17015 unsigned getLoadedSize() const {
17016 unsigned SliceSize = getUsedBits().countPopulation();
17017 assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
17018 return SliceSize / 8;
17021 /// Get the type that will be loaded for this slice.
17022 /// Note: This may not be the final type for the slice.
17023 EVT getLoadedType() const {
17024 assert(DAG && "Missing context");
17025 LLVMContext &Ctxt = *DAG->getContext();
17026 return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
17029 /// Get the alignment of the load used for this slice.
17030 Align getAlign() const {
17031 Align Alignment = Origin->getAlign();
17032 uint64_t Offset = getOffsetFromBase();
// Derive the alignment of base + Offset from the original alignment.
17034 Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
17038 /// Check if this slice can be rewritten with legal operations.
17039 bool isLegal() const {
17040 // An invalid slice is not legal.
17041 if (!Origin || !Inst || !DAG)
17044 // Offsets are for indexed load only, we do not handle that.
17045 if (!Origin->getOffset().isUndef())
17048 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
17050 // Check that the type is legal.
17051 EVT SliceType = getLoadedType();
17052 if (!TLI.isTypeLegal(SliceType))
17055 // Check that the load is legal for this type.
17056 if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
17059 // Check that the offset can be computed.
17060 // 1. Check its type.
17061 EVT PtrType = Origin->getBasePtr().getValueType();
17062 if (PtrType == MVT::Untyped || PtrType.isExtended())
17065 // 2. Check that it fits in the immediate.
17066 if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
17069 // 3. Check that the computation is legal.
17070 if (!TLI.isOperationLegal(ISD::ADD, PtrType))
17073 // Check that the zext is legal if it needs one.
17074 EVT TruncateType = Inst->getValueType(0);
17075 if (TruncateType != SliceType &&
17076 !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
17082 /// Get the offset in bytes of this slice in the original chunk of
17084 /// \pre DAG != nullptr.
17085 uint64_t getOffsetFromBase() const {
17086 assert(DAG && "Missing context.");
17087 bool IsBigEndian = DAG->getDataLayout().isBigEndian();
17088 assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
17089 uint64_t Offset = Shift / 8;
17090 unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
17091 assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
17092 "The size of the original loaded type is not a multiple of a"
17094 // If Offset is bigger than TySizeInBytes, it means we are loading all
17095 // zeros. This should have been optimized before in the process.
17096 assert(TySizeInBytes > Offset &&
17097 "Invalid shift amount for given loaded size");
// On big-endian targets the byte offset counts from the most significant
// end, so mirror it.
17099 Offset = TySizeInBytes - Offset - getLoadedSize();
17103 /// Generate the sequence of instructions to load the slice
17104 /// represented by this object and redirect the uses of this slice to
17105 /// this new sequence of instructions.
17106 /// \pre this->Inst && this->Origin are valid Instructions and this
17107 /// object passed the legal check: LoadedSlice::isLegal returned true.
17108 /// \return The last instruction of the sequence used to load the slice.
17109 SDValue loadSlice() const {
17110 assert(Inst && Origin && "Unable to replace a non-existing slice.");
17111 const SDValue &OldBaseAddr = Origin->getBasePtr();
17112 SDValue BaseAddr = OldBaseAddr;
17113 // Get the offset in that chunk of bytes w.r.t. the endianness.
17114 int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
17115 assert(Offset >= 0 && "Offset too big to fit in int64_t!");
17117 // BaseAddr = BaseAddr + Offset.
17118 EVT ArithType = BaseAddr.getValueType();
17120 BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
17121 DAG->getConstant(Offset, DL, ArithType));
17124 // Create the type of the loaded slice according to its size.
17125 EVT SliceType = getLoadedType();
17127 // Create the load for the slice.
17129 DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
17130 Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
17131 Origin->getMemOperand()->getFlags());
17132 // If the final type is not the same as the loaded type, this means that
17133 // we have to pad with zero. Create a zero extend for that.
17134 EVT FinalType = Inst->getValueType(0);
17135 if (SliceType != FinalType)
17137 DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
17141 /// Check if this slice can be merged with an expensive cross register
17142 /// bank copy. E.g.,
17144 /// f = bitcast i32 i to float
17145 bool canMergeExpensiveCrossRegisterBankCopy() const {
17146 if (!Inst || !Inst->hasOneUse())
17148 SDNode *Use = *Inst->use_begin();
17149 if (Use->getOpcode() != ISD::BITCAST)
17151 assert(DAG && "Missing context");
17152 const TargetLowering &TLI = DAG->getTargetLoweringInfo();
17153 EVT ResVT = Use->getValueType(0);
17154 const TargetRegisterClass *ResRC =
17155 TLI.getRegClassFor(ResVT.getSimpleVT(), Use->isDivergent());
17156 const TargetRegisterClass *ArgRC =
17157 TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT(),
17158 Use->getOperand(0)->isDivergent());
17159 if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
17162 // At this point, we know that we perform a cross-register-bank copy.
17163 // Check if it is expensive.
17164 const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
17165 // Assume bitcasts are cheap, unless both register classes do not
17166 // explicitly share a common sub class.
17167 if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
17170 // Check if it will be merged with the load.
17171 // 1. Check the alignment / fast memory access constraint.
17172 bool IsFast = false;
17173 if (!TLI.allowsMemoryAccess(*DAG->getContext(), DAG->getDataLayout(), ResVT,
17174 Origin->getAddressSpace(), getAlign(),
17175 Origin->getMemOperand()->getFlags(), &IsFast) ||
17179 // 2. Check that the load is a legal operation for that type.
17180 if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
17183 // 3. Check that we do not have a zext in the way.
17184 if (Inst->getValueType(0) != getLoadedType())
17191 } // end anonymous namespace
17193 /// Check that all bits set in \p UsedBits form a dense region, i.e.,
17194 /// \p UsedBits looks like 0..0 1..1 0..0.
17195 static bool areUsedBitsDense(const APInt &UsedBits) {
17196 // If all the bits are one, this is dense!
17197 if (UsedBits.isAllOnes())
17200 // Get rid of the unused bits on the right.
17201 APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
17202 // Get rid of the unused bits on the left.
17203 if (NarrowedUsedBits.countLeadingZeros())
17204 NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
17205 // Check that the chunk of bits is completely used.
17206 return NarrowedUsedBits.isAllOnes();
17209 /// Check whether or not \p First and \p Second are next to each other
17210 /// in memory. This means that there is no hole between the bits loaded
17211 /// by \p First and the bits loaded by \p Second.
17212 static bool areSlicesNextToEachOther(const LoadedSlice &First,
17213 const LoadedSlice &Second) {
17214 assert(First.Origin == Second.Origin && First.Origin &&
17215 "Unable to match different memory origins.");
17216 APInt UsedBits = First.getUsedBits();
17217 assert((UsedBits & Second.getUsedBits()) == 0 &&
17218 "Slices are not supposed to overlap.");
// Adjacency holds iff the union of the two disjoint bit masks is dense.
17219 UsedBits |= Second.getUsedBits();
17220 return areUsedBitsDense(UsedBits);
17223 /// Adjust the \p GlobalLSCost according to the target
17224 /// pairing capabilities and the layout of the slices.
17225 /// \pre \p GlobalLSCost should account for at least as many loads as
17226 /// there is in the slices in \p LoadedSlices.
17227 static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
17228 LoadedSlice::Cost &GlobalLSCost) {
17229 unsigned NumberOfSlices = LoadedSlices.size();
17230 // If there is less than 2 elements, no pairing is possible.
17231 if (NumberOfSlices < 2)
17234 // Sort the slices so that elements that are likely to be next to each
17235 // other in memory are next to each other in the list.
17236 llvm::sort(LoadedSlices, [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
17237 assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
17238 return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
17240 const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
17241 // First (resp. Second) is the first (resp. Second) potentially candidate
17242 // to be placed in a paired load.
17243 const LoadedSlice *First = nullptr;
17244 const LoadedSlice *Second = nullptr;
17245 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
17246 // Set the beginning of the pair.
// NOTE(review): the loop-increment clause and several continue paths were
// dropped by the extraction; the pairing loop below is kept verbatim.
17248 Second = &LoadedSlices[CurrSlice];
17250 // If First is NULL, it means we start a new pair.
17251 // Get to the next slice.
17255 EVT LoadedType = First->getLoadedType();
17257 // If the types of the slices are different, we cannot pair them.
17258 if (LoadedType != Second->getLoadedType())
17261 // Check if the target supplies paired loads for this type.
17262 Align RequiredAlignment;
17263 if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
17264 // move to the next pair, this type is hopeless.
17268 // Check if we meet the alignment requirement.
17269 if (First->getAlign() < RequiredAlignment)
17272 // Check that both loads are next to each other in memory.
17273 if (!areSlicesNextToEachOther(*First, *Second))
// A pair was formed: the two slices will be emitted as one paired load,
// so the global cost loses one load.
17276 assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
17277 --GlobalLSCost.Loads;
17278 // Move to the next pair.
17283 /// Check the profitability of all involved LoadedSlice.
17284 /// Currently, it is considered profitable if there are exactly two
17285 /// involved slices (1) which are (2) next to each other in memory, and
17286 /// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
17288 /// Note: The order of the elements in \p LoadedSlices may be modified, but not
17289 /// the elements themselves.
17291 /// FIXME: When the cost model will be mature enough, we can relax
17292 /// constraints (1) and (2).
17293 static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
17294 const APInt &UsedBits, bool ForCodeSize) {
17295 unsigned NumberOfSlices = LoadedSlices.size();
// In stress mode, slice as soon as there is more than one slice, bypassing
// the cost model entirely.
17296 if (StressLoadSlicing)
17297 return NumberOfSlices > 1;
// Constraint (1): exactly two slices.
17300 if (NumberOfSlices != 2)
// Constraint (2): the used bits must be dense.
17304 if (!areUsedBitsDense(UsedBits))
17308 LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
17309 // The original code has one big load.
17310 OrigCost.Loads = 1;
17311 for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
17312 const LoadedSlice &LS = LoadedSlices[CurrSlice];
17313 // Accumulate the cost of all the slices.
17314 LoadedSlice::Cost SliceCost(LS, ForCodeSize);
17315 GlobalSlicingCost += SliceCost;
17317 // Account as cost in the original configuration the gain obtained
17318 // with the current slices.
17319 OrigCost.addSliceGain(LS);
17322 // If the target supports paired load, adjust the cost accordingly.
17323 adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
// Constraint (3): slicing must be strictly cheaper than the original load.
17324 return OrigCost > GlobalSlicingCost;
17327 /// If the given load, \p LI, is used only by trunc or trunc(lshr)
17328 /// operations, split it in the various pieces being extracted.
17330 /// This sort of thing is introduced by SROA.
17331 /// This slicing takes care not to insert overlapping loads.
17332 /// \pre LI is a simple load (i.e., not an atomic or volatile load).
17333 bool DAGCombiner::SliceUpLoad(SDNode *N) {
// Only attempt slicing once the DAG has been fully legalized.
17334 if (Level < AfterLegalizeDAG)
17337 LoadSDNode *LD = cast<LoadSDNode>(N);
17338 if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
17339 !LD->getValueType(0).isInteger())
17342 // The algorithm to split up a load of a scalable vector into individual
17343 // elements currently requires knowing the length of the loaded type,
17344 // so will need adjusting to work on scalable vectors.
17345 if (LD->getValueType(0).isScalableVector())
17348 // Keep track of already used bits to detect overlapping values.
17349 // In that case, we will just abort the transformation.
17350 APInt UsedBits(LD->getValueSizeInBits(0), 0);
17352 SmallVector<LoadedSlice, 4> LoadedSlices;
17354 // Check if this load is used as several smaller chunks of bits.
17355 // Basically, look for uses in trunc or trunc(lshr) and record a new chain
17356 // of computation for each trunc.
17357 for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
17358 UI != UIEnd; ++UI) {
17359 // Skip the uses of the chain.
17360 if (UI.getUse().getResNo() != 0)
17363 SDNode *User = *UI;
17364 unsigned Shift = 0;
17366 // Check if this is a trunc(lshr).
17367 if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
17368 isa<ConstantSDNode>(User->getOperand(1))) {
17369 Shift = User->getConstantOperandVal(1);
// Look through the shift to its single user, which must be the truncate.
17370 User = *User->use_begin();
17373 // At this point, User is a TRUNCATE iff we encountered trunc or
17375 if (User->getOpcode() != ISD::TRUNCATE)
17378 // The width of the type must be a power of 2 and greater than 8-bits.
17379 // Otherwise the load cannot be represented in LLVM IR.
17380 // Moreover, if we shifted with a non-8-bits multiple, the slice
17381 // will be across several bytes. We do not support that.
17382 unsigned Width = User->getValueSizeInBits(0);
17383 if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
17386 // Build the slice for this chain of computations.
17387 LoadedSlice LS(User, LD, Shift, &DAG);
17388 APInt CurrentUsedBits = LS.getUsedBits();
17390 // Check if this slice overlaps with another.
17391 if ((CurrentUsedBits & UsedBits) != 0)
17393 // Update the bits used globally.
17394 UsedBits |= CurrentUsedBits;
17396 // Check if the new slice would be legal.
17400 // Record the slice.
17401 LoadedSlices.push_back(LS);
17404 // Abort slicing if it does not seem to be profitable.
17405 if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
17410 // Rewrite each chain to use an independent load.
17411 // By construction, each chain can be represented by a unique load.
17413 // Prepare the argument for the new token factor for all the slices.
17414 SmallVector<SDValue, 8> ArgChains;
17415 for (const LoadedSlice &LS : LoadedSlices) {
17416 SDValue SliceInst = LS.loadSlice();
17417 CombineTo(LS.Inst, SliceInst, true);
// loadSlice() may wrap the load in an extension; peel it to reach the load.
17418 if (SliceInst.getOpcode() != ISD::LOAD)
17419 SliceInst = SliceInst.getOperand(0);
17420 assert(SliceInst->getOpcode() == ISD::LOAD &&
17421 "It takes more than a zext to get to the loaded slice!!");
17422 ArgChains.push_back(SliceInst.getValue(1));
// Tie all slice chains together and redirect users of the original load's
// chain result to the new token factor.
17425 SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
17427 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
17428 AddToWorklist(Chain.getNode());
17432 /// Check to see if V is (and load (ptr), imm), where the load is having
17433 /// specific bytes cleared out. If so, return the byte size being masked out
17434 /// and the shift amount.
// Returns (0, 0) on failure; callers test Result.first for success.
17435 static std::pair<unsigned, unsigned>
17436 CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
17437 std::pair<unsigned, unsigned> Result(0, 0);
17439 // Check for the structure we're looking for.
17440 if (V->getOpcode() != ISD::AND ||
17441 !isa<ConstantSDNode>(V->getOperand(1)) ||
17442 !ISD::isNormalLoad(V->getOperand(0).getNode()))
17445 // Check the chain and pointer.
17446 LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
17447 if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
17449 // This only handles simple types.
17450 if (V.getValueType() != MVT::i16 &&
17451 V.getValueType() != MVT::i32 &&
17452 V.getValueType() != MVT::i64)
17455 // Check the constant mask. Invert it so that the bits being masked out are
17456 // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
17457 // follow the sign bit for uniformity.
17458 uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
17459 unsigned NotMaskLZ = countLeadingZeros(NotMask);
17460 if (NotMaskLZ & 7) return Result; // Must be multiple of a byte.
17461 unsigned NotMaskTZ = countTrailingZeros(NotMask);
17462 if (NotMaskTZ & 7) return Result; // Must be multiple of a byte.
17463 if (NotMaskLZ == 64) return Result; // All zero mask.
17465 // See if we have a continuous run of bits. If so, we have 0*1+0*
17466 if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
17469 // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
17470 if (V.getValueType() != MVT::i64 && NotMaskLZ)
17471 NotMaskLZ -= 64-V.getValueSizeInBits();
17473 unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
17474 switch (MaskedBytes) {
17478 default: return Result; // All one mask, or 5-byte mask.
17481 // Verify that the first bit starts at a multiple of mask so that the access
17482 // is aligned the same as the access width.
17483 if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
17485 // For narrowing to be valid, it must be the case that the load is the
17486 // immediately preceding memory operation before the store.
17487 if (LD == Chain.getNode())
17489 else if (Chain->getOpcode() == ISD::TokenFactor &&
17490 SDValue(LD, 1).hasOneUse()) {
17491 // LD has only 1 chain use so there are no indirect dependencies.
17492 if (!LD->isOperandOf(Chain.getNode()))
17495 return Result; // Fail.
17497 Result.first = MaskedBytes;
17498 Result.second = NotMaskTZ/8;
17502 /// Check to see if IVal is something that provides a value as specified by
17503 /// MaskInfo. If so, replace the specified store with a narrower store of
17504 /// truncated IVal.
// MaskInfo is (number of masked bytes, byte shift), as produced by
// CheckForMaskedLoad. Returns the new (narrower) store, or SDValue() on
// failure.
17506 ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
17507 SDValue IVal, StoreSDNode *St,
17509 unsigned NumBytes = MaskInfo.first;
17510 unsigned ByteShift = MaskInfo.second;
17511 SelectionDAG &DAG = DC->getDAG();
17513 // Check to see if IVal is all zeros in the part being masked in by the 'or'
17514 // that uses this. If not, this is not a replacement.
17515 APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
17516 ByteShift*8, (ByteShift+NumBytes)*8);
17517 if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();
17519 // Check that it is legal on the target to do this. It is legal if the new
17520 // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
17521 // legalization. If the source type is legal, but the store type isn't, see
17522 // if we can use a truncating store.
17523 MVT VT = MVT::getIntegerVT(NumBytes * 8);
17524 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17525 bool UseTruncStore;
17526 if (DC->isTypeLegal(VT))
17527 UseTruncStore = false;
17528 else if (TLI.isTypeLegal(IVal.getValueType()) &&
17529 TLI.isTruncStoreLegal(IVal.getValueType(), VT))
17530 UseTruncStore = true;
17533 // Check that the target doesn't think this is a bad idea.
17534 if (St->getMemOperand() &&
17535 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
17536 *St->getMemOperand()))
17539 // Okay, we can do this! Replace the 'St' store with a store of IVal that is
17540 // shifted by ByteShift and truncated down to NumBytes.
17543 IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
17544 DAG.getConstant(ByteShift*8, DL,
17545 DC->getShiftAmountTy(IVal.getValueType())));
17548 // Figure out the offset for the store and the alignment of the access.
// Little endian: the masked bytes sit ByteShift bytes from the start;
// big endian: they sit ByteShift bytes from the end.
17550 if (DAG.getDataLayout().isLittleEndian())
17551 StOffset = ByteShift;
17553 StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
17555 SDValue Ptr = St->getBasePtr();
17558 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(StOffset), DL);
// Truncating-store path: store the (still wide) value with a narrower
// memory VT instead of truncating the value first.
17563 return DAG.getTruncStore(St->getChain(), SDLoc(St), IVal, Ptr,
17564 St->getPointerInfo().getWithOffset(StOffset),
17565 VT, St->getOriginalAlign());
17567 // Truncate down to the new size.
17568 IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
17571 .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
17572 St->getPointerInfo().getWithOffset(StOffset),
17573 St->getOriginalAlign());
17576 /// Look for sequence of load / op / store where op is one of 'or', 'xor', and
17577 /// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
17578 /// narrowing the load and store if it would end up being a win for performance
17580 SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
17581 StoreSDNode *ST = cast<StoreSDNode>(N);
17582 if (!ST->isSimple())
17585 SDValue Chain = ST->getChain();
17586 SDValue Value = ST->getValue();
17587 SDValue Ptr = ST->getBasePtr();
17588 EVT VT = Value.getValueType();
17590 if (ST->isTruncatingStore() || VT.isVector())
17593 unsigned Opc = Value.getOpcode();
// Only a single-use OR/XOR/AND feeding the store is handled.
17595 if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
17596 !Value.hasOneUse())
17599 // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
17600 // is a byte mask indicating a consecutive number of bytes, check to see if
17601 // Y is known to provide just those bytes. If so, we try to replace the
17602 // load + replace + store sequence with a single (narrower) store, which makes
17604 if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
17605 std::pair<unsigned, unsigned> MaskedLoad;
17606 MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
17607 if (MaskedLoad.first)
17608 if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
17609 Value.getOperand(1), ST,this))
17612 // Or is commutative, so try swapping X and Y.
17613 MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
17614 if (MaskedLoad.first)
17615 if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
17616 Value.getOperand(0), ST,this))
17620 if (!EnableReduceLoadOpStoreWidth)
17623 if (Value.getOperand(1).getOpcode() != ISD::Constant)
17626 SDValue N0 = Value.getOperand(0);
// The load must feed only this op and be chained directly to the store.
17627 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
17628 Chain == SDValue(N0.getNode(), 1)) {
17629 LoadSDNode *LD = cast<LoadSDNode>(N0);
17630 if (LD->getBasePtr() != Ptr ||
17631 LD->getPointerInfo().getAddrSpace() !=
17632 ST->getPointerInfo().getAddrSpace())
17635 // Find the type to narrow the load / op / store to.
17636 SDValue N1 = Value.getOperand(1);
17637 unsigned BitWidth = N1.getValueSizeInBits();
17638 APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
// For AND, invert the mask so the bits actually changed are the set bits,
// matching the OR/XOR convention.
17639 if (Opc == ISD::AND)
17640 Imm ^= APInt::getAllOnes(BitWidth);
17641 if (Imm == 0 || Imm.isAllOnes())
17643 unsigned ShAmt = Imm.countTrailingZeros();
17644 unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
17645 unsigned NewBW = NextPowerOf2(MSB - ShAmt);
17646 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
17647 // The narrowing should be profitable, the load/store operation should be
17648 // legal (or custom) and the store size should be equal to the NewVT width.
17649 while (NewBW < BitWidth &&
17650 (NewVT.getStoreSizeInBits() != NewBW ||
17651 !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
17652 !TLI.isNarrowingProfitable(VT, NewVT))) {
17653 NewBW = NextPowerOf2(NewBW);
17654 NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
17656 if (NewBW >= BitWidth)
17659 // If the lsb that changed does not start at a NewBW-bit type boundary,
17660 // start at the previous boundary.
17662 ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
// All changed bits must fit inside the chosen NewBW window at ShAmt.
17663 APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
17664 std::min(BitWidth, ShAmt + NewBW));
17665 if ((Imm & Mask) == Imm) {
17666 APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
17667 if (Opc == ISD::AND)
17668 NewImm ^= APInt::getAllOnes(NewBW);
17669 uint64_t PtrOff = ShAmt / 8;
17670 // For big endian targets, we need to adjust the offset to the pointer to
17671 // load the correct bytes.
17672 if (DAG.getDataLayout().isBigEndian())
17673 PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
17675 bool IsFast = false;
17676 Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
17677 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
17678 LD->getAddressSpace(), NewAlign,
17679 LD->getMemOperand()->getFlags(), &IsFast) ||
// Build the narrowed load / op / store sequence at the byte offset.
17684 DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(PtrOff), SDLoc(LD));
17686 DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
17687 LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
17688 LD->getMemOperand()->getFlags(), LD->getAAInfo());
17689 SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
17690 DAG.getConstant(NewImm, SDLoc(Value),
17693 DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
17694 ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
17696 AddToWorklist(NewPtr.getNode());
17697 AddToWorklist(NewLD.getNode());
17698 AddToWorklist(NewVal.getNode());
17699 WorklistRemover DeadNodes(*this);
17700 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
17709 /// For a given floating point load / store pair, if the load value isn't used
17710 /// by any other operations, then consider transforming the pair to integer
17711 /// load / store operations if the target deems the transformation profitable.
17712 SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
17713 StoreSDNode *ST = cast<StoreSDNode>(N);
17714 SDValue Value = ST->getValue();
17715 if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
17716 Value.hasOneUse()) {
17717 LoadSDNode *LD = cast<LoadSDNode>(Value);
17718 EVT VT = LD->getMemoryVT();
// Preconditions: FP memory type, matching load/store types, no
// non-temporal hints, and both accesses in address space 0.
17719 if (!VT.isFloatingPoint() ||
17720 VT != ST->getMemoryVT() ||
17721 LD->isNonTemporal() ||
17722 ST->isNonTemporal() ||
17723 LD->getPointerInfo().getAddrSpace() != 0 ||
17724 ST->getPointerInfo().getAddrSpace() != 0)
17727 TypeSize VTSize = VT.getSizeInBits();
17729 // We don't know the size of scalable types at compile time so we cannot
17730 // create an integer of the equivalent size.
17731 if (VTSize.isScalable())
17734 bool FastLD = false, FastST = false;
17735 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
// Both the integer load and store must be legal, desirable per the target
// hook, and fast for the existing memory operands.
17736 if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
17737 !TLI.isOperationLegal(ISD::STORE, IntVT) ||
17738 !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
17739 !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT) ||
17740 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
17741 *LD->getMemOperand(), &FastLD) ||
17742 !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), IntVT,
17743 *ST->getMemOperand(), &FastST) ||
17744 !FastLD || !FastST)
17748 DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
17749 LD->getPointerInfo(), LD->getAlign());
17752 DAG.getStore(ST->getChain(), SDLoc(N), NewLD, ST->getBasePtr(),
17753 ST->getPointerInfo(), ST->getAlign());
17755 AddToWorklist(NewLD.getNode());
17756 AddToWorklist(NewST.getNode());
17757 WorklistRemover DeadNodes(*this);
17758 DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
17766 // This is a helper function for visitMUL to check the profitability
17767 // of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
17768 // MulNode is the original multiply, AddNode is (add x, c1),
17769 // and ConstNode is c2.
17771 // If the (add x, c1) has multiple uses, we could increase
17772 // the number of adds if we make this transformation.
17773 // It would only be worth doing this if we can remove a
17774 // multiply in the process. Check for that here.
17778 // We're checking for cases where we have common "c3 * A" expressions.
17779 bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode, SDValue AddNode,
17780 SDValue ConstNode) {
17783 // If the add only has one use, and the target thinks the folding is
17784 // profitable or does not lead to worse code, this would be OK to do.
17785 if (AddNode->hasOneUse() &&
17786 TLI.isMulAddWithConstProfitable(AddNode, ConstNode))
17789 // Walk all the users of the constant with which we're multiplying.
17790 for (SDNode *Use : ConstNode->uses()) {
17791 if (Use == MulNode) // This use is the one we're on right now. Skip it.
17794 if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
17796 SDNode *MulVar = AddNode.getOperand(0).getNode();
17798 // OtherOp is what we're multiplying against the constant.
// The constant may appear as either multiply operand.
17799 if (Use->getOperand(0) == ConstNode)
17800 OtherOp = Use->getOperand(1).getNode();
17802 OtherOp = Use->getOperand(0).getNode();
17804 // Check to see if multiply is with the same operand of our "add".
17806 // ConstNode = CONST
17807 // Use = ConstNode * A <-- visiting Use. OtherOp is A.
17809 // AddNode = (A + c1) <-- MulVar is A.
17810 // = AddNode * ConstNode <-- current visiting instruction.
17812 // If we make this transformation, we will have a common
17813 // multiply (ConstNode * A) that we can save.
17814 if (OtherOp == MulVar)
17817 // Now check to see if a future expansion will give us a common
17820 // ConstNode = CONST
17821 // AddNode = (A + c1)
17822 // ... = AddNode * ConstNode <-- current visiting instruction.
17824 // OtherOp = (A + c2)
17825 // Use = OtherOp * ConstNode <-- visiting Use.
17827 // If we make this transformation, we will have a common
17828 // multiply (CONST * A) after we also do the same transformation
17829 // to the "t2" instruction.
17830 if (OtherOp->getOpcode() == ISD::ADD &&
17831 DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
17832 OtherOp->getOperand(0).getNode() == MulVar)
17837 // Didn't find a case where this would be profitable.
// Build a TokenFactor over the input chains of the first NumStores stores,
// skipping chains that are themselves one of the merged stores.
17841 SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
17842 unsigned NumStores) {
17843 SmallVector<SDValue, 8> Chains;
17844 SmallPtrSet<const SDNode *, 8> Visited;
17845 SDLoc StoreDL(StoreNodes[0].MemNode);
// Pre-mark all merged stores so their own chains are filtered out below.
17847 for (unsigned i = 0; i < NumStores; ++i) {
17848 Visited.insert(StoreNodes[i].MemNode);
17851 // Don't include nodes that are children or repeated nodes.
17852 for (unsigned i = 0; i < NumStores; ++i) {
17853 if (Visited.insert(StoreNodes[i].MemNode->getChain().getNode()).second)
17854 Chains.push_back(StoreNodes[i].MemNode->getChain());
17857 assert(Chains.size() > 0 && "Chain should have generated a chain");
17858 return DAG.getTokenFactor(StoreDL, Chains);
// Merge NumStores adjacent stores of constants or extracted vector elements
// into a single (possibly vector or truncating) store.
17861 bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
17862 SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
17863 bool IsConstantSrc, bool UseVector, bool UseTrunc) {
17864 // Make sure we have something to merge.
17868 assert((!UseTrunc || !UseVector) &&
17869 "This optimization cannot emit a vector truncating store");
17871 // The latest Node in the DAG.
17872 SDLoc DL(StoreNodes[0].MemNode);
17874 TypeSize ElementSizeBits = MemVT.getStoreSizeInBits();
17875 unsigned SizeInBits = NumStores * ElementSizeBits;
17876 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
17878 Optional<MachineMemOperand::Flags> Flags;
// All merged stores must carry identical memory-operand flags; AA info is
// concatenated across them.
17880 for (unsigned I = 0; I != NumStores; ++I) {
17881 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
17883 Flags = St->getMemOperand()->getFlags();
17884 AAInfo = St->getAAInfo();
17887 // Skip merging if there's an inconsistent flag.
17888 if (Flags != St->getMemOperand()->getFlags())
17890 // Concatenate AA metadata.
17891 AAInfo = AAInfo.concat(St->getAAInfo());
17896 unsigned Elts = NumStores * NumMemElts;
17897 // Get the type for the merged vector store.
17898 StoreTy = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
17900 StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
17904 if (IsConstantSrc) {
17905 SmallVector<SDValue, 8> BuildVector;
17906 for (unsigned I = 0; I != NumStores; ++I) {
17907 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
17908 SDValue Val = St->getValue();
17909 // If constant is of the wrong type, convert it now.
17910 if (MemVT != Val.getValueType()) {
17911 Val = peekThroughBitcasts(Val);
17912 // Deal with constants of wrong size.
17913 if (ElementSizeBits != Val.getValueSizeInBits()) {
17915 EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
17916 if (isa<ConstantFPSDNode>(Val)) {
17917 // Not clear how to truncate FP values.
17921 if (auto *C = dyn_cast<ConstantSDNode>(Val))
17922 Val = DAG.getConstant(C->getAPIntValue()
17923 .zextOrTrunc(Val.getValueSizeInBits())
17924 .zextOrTrunc(ElementSizeBits),
17925 SDLoc(C), IntMemVT);
17927 // Make sure the correctly sized type is used.
17928 Val = DAG.getBitcast(MemVT, Val);
17930 BuildVector.push_back(Val);
17932 StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
17933 : ISD::BUILD_VECTOR,
17934 DL, StoreTy, BuildVector);
17936 SmallVector<SDValue, 8> Ops;
17937 for (unsigned i = 0; i < NumStores; ++i) {
17938 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
17939 SDValue Val = peekThroughBitcasts(St->getValue());
17940 // All operands of BUILD_VECTOR / CONCAT_VECTOR must be of
17941 // type MemVT. If the underlying value is not the correct
17942 // type, but it is an extraction of an appropriate vector we
17943 // can recast Val to be of the correct type. This may require
17944 // converting between EXTRACT_VECTOR_ELT and
17945 // EXTRACT_SUBVECTOR.
17946 if ((MemVT != Val.getValueType()) &&
17947 (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
17948 Val.getOpcode() == ISD::EXTRACT_SUBVECTOR)) {
17949 EVT MemVTScalarTy = MemVT.getScalarType();
17950 // We may need to add a bitcast here to get types to line up.
17951 if (MemVTScalarTy != Val.getValueType().getScalarType()) {
17952 Val = DAG.getBitcast(MemVT, Val);
17954 unsigned OpC = MemVT.isVector() ? ISD::EXTRACT_SUBVECTOR
17955 : ISD::EXTRACT_VECTOR_ELT;
17956 SDValue Vec = Val.getOperand(0);
17957 SDValue Idx = Val.getOperand(1);
17958 Val = DAG.getNode(OpC, SDLoc(Val), MemVT, Vec, Idx);
17961 Ops.push_back(Val);
17964 // Build the extracted vector elements back into a vector.
17965 StoredVal = DAG.getNode(MemVT.isVector() ? ISD::CONCAT_VECTORS
17966 : ISD::BUILD_VECTOR,
17970 // We should always use a vector store when merging extracted vector
17971 // elements, so this path implies a store of constants.
17972 assert(IsConstantSrc && "Merged vector elements should use vector store");
17974 APInt StoreInt(SizeInBits, 0);
17976 // Construct a single integer constant which is made of the smaller
17977 // constant inputs.
17978 bool IsLE = DAG.getDataLayout().isLittleEndian();
// On little-endian targets, visit the stores in reverse so the first
// store's value ends up in the low bits of the combined constant.
17979 for (unsigned i = 0; i < NumStores; ++i) {
17980 unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
17981 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
17983 SDValue Val = St->getValue();
17984 Val = peekThroughBitcasts(Val);
17985 StoreInt <<= ElementSizeBits;
17986 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
17987 StoreInt |= C->getAPIntValue()
17988 .zextOrTrunc(ElementSizeBits)
17989 .zextOrTrunc(SizeInBits);
17990 } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
17991 StoreInt |= C->getValueAPF()
17993 .zextOrTrunc(ElementSizeBits)
17994 .zextOrTrunc(SizeInBits);
17995 // If fp truncation is necessary give up for now.
17996 if (MemVT.getSizeInBits() != ElementSizeBits)
17999 llvm_unreachable("Invalid constant element type")
18003 // Create the new Load and Store operations.
18004 StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
18007 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
18008 SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
18010 // Make sure we use trunc store if it's necessary to be legal.
18013 NewStore = DAG.getStore(NewChain, DL, StoredVal, FirstInChain->getBasePtr(),
18014 FirstInChain->getPointerInfo(),
18015 FirstInChain->getAlign(), *Flags, AAInfo);
18016 } else { // Must be realized as a trunc store
18017 EVT LegalizedStoredValTy =
18018 TLI.getTypeToTransformTo(*DAG.getContext(), StoredVal.getValueType());
18019 unsigned LegalizedStoreSize = LegalizedStoredValTy.getSizeInBits();
18020 ConstantSDNode *C = cast<ConstantSDNode>(StoredVal);
18021 SDValue ExtendedStoreVal =
18022 DAG.getConstant(C->getAPIntValue().zextOrTrunc(LegalizedStoreSize), DL,
18023 LegalizedStoredValTy);
18024 NewStore = DAG.getTruncStore(
18025 NewChain, DL, ExtendedStoreVal, FirstInChain->getBasePtr(),
18026 FirstInChain->getPointerInfo(), StoredVal.getValueType() /*TVT*/,
18027 FirstInChain->getAlign(), *Flags, AAInfo);
18030 // Replace all merged stores with the new store.
18031 for (unsigned i = 0; i < NumStores; ++i)
18032 CombineTo(StoreNodes[i].MemNode, NewStore);
18034 AddToWorklist(NewChain.getNode());
// Populate StoreNodes with stores that may be mergeable with St (same base
// pointer, compatible source kind), and set RootNode to the common chain
// ancestor the search was rooted at.
18038 void DAGCombiner::getStoreMergeCandidates(
18039 StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes,
18040 SDNode *&RootNode) {
18041 // This holds the base pointer, index, and the offset in bytes from the base
18042 // pointer. We must have a base and an offset. Do not handle stores to undef
18044 BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
18045 if (!BasePtr.getBase().getNode() || BasePtr.getBase().isUndef())
18048 SDValue Val = peekThroughBitcasts(St->getValue());
18049 StoreSource StoreSrc = getStoreSource(Val);
18050 assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");
18052 // Match on loadbaseptr if relevant.
18053 EVT MemVT = St->getMemoryVT();
18054 BaseIndexOffset LBasePtr;
18056 if (StoreSrc == StoreSource::Load) {
18057 auto *Ld = cast<LoadSDNode>(Val);
18058 LBasePtr = BaseIndexOffset::match(Ld, DAG);
18059 LoadVT = Ld->getMemoryVT();
18060 // Load and store should be the same type.
18061 if (MemVT != LoadVT)
18063 // Loads must only have one use.
18064 if (!Ld->hasNUsesOfValue(1, 0))
18066 // The memory operands must not be volatile/indexed/atomic.
18067 // TODO: May be able to relax for unordered atomics (see D66309)
18068 if (!Ld->isSimple() || Ld->isIndexed())
// Predicate deciding whether Other can be merged with St; on success also
// returns Other's base pointer and its byte offset from St's base.
18071 auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
18072 int64_t &Offset) -> bool {
18073 // The memory operands must not be volatile/indexed/atomic.
18074 // TODO: May be able to relax for unordered atomics (see D66309)
18075 if (!Other->isSimple() || Other->isIndexed())
18077 // Don't mix temporal stores with non-temporal stores.
18078 if (St->isNonTemporal() != Other->isNonTemporal())
18080 SDValue OtherBC = peekThroughBitcasts(Other->getValue());
18081 // Allow merging constants of different types as integers.
18082 bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
18083 : Other->getMemoryVT() != MemVT;
18084 switch (StoreSrc) {
18085 case StoreSource::Load: {
18088 // The Load's Base Ptr must also match.
18089 auto *OtherLd = dyn_cast<LoadSDNode>(OtherBC)
18092 BaseIndexOffset LPtr = BaseIndexOffset::match(OtherLd, DAG);
18093 if (LoadVT != OtherLd->getMemoryVT())
18095 // Loads must only have one use.
18096 if (!OtherLd->hasNUsesOfValue(1, 0))
18098 // The memory operands must not be volatile/indexed/atomic.
18099 // TODO: May be able to relax for unordered atomics (see D66309)
18100 if (!OtherLd->isSimple() || OtherLd->isIndexed())
18102 // Don't mix temporal loads with non-temporal loads.
18103 if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
18105 if (!(LBasePtr.equalBaseIndex(LPtr, DAG)))
18109 case StoreSource::Constant:
18112 if (!isIntOrFPConstant(OtherBC))
18115 case StoreSource::Extract:
18116 // Do not merge truncated stores here.
18117 if (Other->isTruncatingStore())
18119 if (!MemVT.bitsEq(OtherBC.getValueType()))
18121 if (OtherBC.getOpcode() != ISD::EXTRACT_VECTOR_ELT &&
18122 OtherBC.getOpcode() != ISD::EXTRACT_SUBVECTOR)
18126 llvm_unreachable("Unhandled store source for merging")
18128 Ptr = BaseIndexOffset::match(Other, DAG);
18129 return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
18132 // Check if the pair of StoreNode and the RootNode already bail out many
18133 // times which is over the limit in dependence check.
18134 auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
18135 SDNode *RootNode) -> bool {
18136 auto RootCount = StoreRootCountMap.find(StoreNode);
18137 return RootCount != StoreRootCountMap.end() &&
18138 RootCount->second.first == RootNode &&
18139 RootCount->second.second > StoreMergeDependenceLimit;
// Try to record a chain user of the root as a merge candidate.
18142 auto TryToAddCandidate = [&](SDNode::use_iterator UseIter) {
18143 // This must be a chain use.
18144 if (UseIter.getOperandNo() != 0)
18146 if (auto *OtherStore = dyn_cast<StoreSDNode>(*UseIter)) {
18147 BaseIndexOffset Ptr;
18149 if (CandidateMatch(OtherStore, Ptr, PtrDiff) &&
18150 !OverLimitInDependenceCheck(OtherStore, RootNode))
18151 StoreNodes.push_back(MemOpLink(OtherStore, PtrDiff));
18155 // We are looking for a root node which is an ancestor to all mergable
18156 // stores. We search up through a load, to our root and then down
18157 // through all children. For instance we will find Store{1,2,3} if
18158 // St is Store1, Store2, or Store3 where the root is not a load
18159 // which is always true for nonvolatile ops. TODO: Expand
18160 // the search to find all valid candidates through multiple layers of loads.
18163 // |-------|-------|
18164 // Load Load Store3
18168 // FIXME: We should be able to climb and
18169 // descend TokenFactors to find candidates as well.
18171 RootNode = St->getChain().getNode();
18173 unsigned NumNodesExplored = 0;
18174 const unsigned MaxSearchNodes = 1024;
18175 if (auto *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
// If the chain comes from a load, climb one level up to the load's chain
// so stores through sibling loads can be found.
18176 RootNode = Ldn->getChain().getNode();
18177 for (auto I = RootNode->use_begin(), E = RootNode->use_end();
18178 I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored) {
18179 if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) { // walk down chain
18180 for (auto I2 = (*I)->use_begin(), E2 = (*I)->use_end(); I2 != E2; ++I2)
18181 TryToAddCandidate(I2);
18183 // Check stores that depend on the root (e.g. Store 3 in the chart above).
18184 if (I.getOperandNo() == 0 && isa<StoreSDNode>(*I)) {
18185 TryToAddCandidate(I);
18189 for (auto I = RootNode->use_begin(), E = RootNode->use_end();
18190 I != E && NumNodesExplored < MaxSearchNodes; ++I, ++NumNodesExplored)
18191 TryToAddCandidate(I);
18195 // We need to check that merging these stores does not cause a loop in the
18196 // DAG. Any store candidate may depend on another candidate indirectly through
18197 // its operands. Check in parallel by searching up from operands of candidates.
// Returns true when it is safe to merge the first NumStores entries of
// StoreNodes (no candidate store is a predecessor of another candidate);
// false when a dependency is found or the bounded search gives up.
// NOTE(review): this chunk view elides some original lines, so statements
// below may not be contiguous in the real file.
18198 bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
18199 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
18200 SDNode *RootNode) {
18201 // FIXME: We should be able to truncate a full search of
18202 // predecessors by doing a BFS and keeping tabs the originating
18203 // stores from which worklist nodes come from in a similar way to
18204 // TokenFactor simplfication.
18206 SmallPtrSet<const SDNode *, 32> Visited;
18207 SmallVector<const SDNode *, 8> Worklist;
18209 // RootNode is a predecessor to all candidates so we need not search
18210 // past it. Add RootNode (peeking through TokenFactors). Do not count
18211 // these towards size check.
18213 Worklist.push_back(RootNode);
18214 while (!Worklist.empty()) {
18215 auto N = Worklist.pop_back_val();
18216 if (!Visited.insert(N).second)
18217 continue; // Already present in Visited.
// Expand TokenFactors so their operands also become pruning roots; this is
// the "peeking through TokenFactors" mentioned above.
18218 if (N->getOpcode() == ISD::TokenFactor) {
18219 for (SDValue Op : N->ops())
18220 Worklist.push_back(Op.getNode());
18224 // Don't count pruning nodes towards max.
18225 unsigned int Max = 1024 + Visited.size();
18226 // Search Ops of store candidates.
18227 for (unsigned i = 0; i < NumStores; ++i) {
18228 SDNode *N = StoreNodes[i].MemNode;
18229 // Of the 4 Store Operands:
18230 // * Chain (Op 0) -> We have already considered these
18231 // in candidate selection, but only by following the
18232 // chain dependencies. We could still have a chain
18233 // dependency to a load, that has a non-chain dep to
18234 // another load, that depends on a store, etc. So it is
18235 // possible to have dependencies that consist of a mix
18236 // of chain and non-chain deps, and we need to include
18237 // chain operands in the analysis here..
18238 // * Value (Op 1) -> Cycles may happen (e.g. through load chains)
18239 // * Address (Op 2) -> Merged addresses may only vary by a fixed constant,
18240 // but aren't necessarily fromt the same base node, so
18241 // cycles possible (e.g. via indexed store).
18242 // * (Op 3) -> Represents the pre or post-indexing offset (or undef for
18243 // non-indexed stores). Not constant on all targets (e.g. ARM)
18244 // and so can participate in a cycle.
// Seed the upward search with every operand of every candidate store.
18245 for (unsigned j = 0; j < N->getNumOperands(); ++j)
18246 Worklist.push_back(N->getOperand(j).getNode());
18248 // Search through DAG. We can stop early if we find a store node.
18249 for (unsigned i = 0; i < NumStores; ++i)
18250 if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
18252 // If the searching bail out, record the StoreNode and RootNode in the
18253 // StoreRootCountMap. If we have seen the pair many times over a limit,
18254 // we won't add the StoreNode into StoreNodes set again.
18255 if (Visited.size() >= Max) {
// Count bailouts per (store, root) pair so repeated failures eventually
// disqualify this store from candidacy (see OverLimitInDependenceCheck).
18256 auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
18257 if (RootCount.first == RootNode)
18258 RootCount.second++;
18260 RootCount = {RootNode, 1};
// Finds the length of the longest run of stores at the front of the sorted
// StoreNodes list whose offsets are exactly ElementSizeBytes apart, trimming
// overlapping / non-consecutive leading entries from StoreNodes as it goes.
// NOTE(review): some original lines are elided in this view (e.g. the loop
// increments and early returns between the visible statements).
18268 DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
18269 int64_t ElementSizeBytes) const {
18271 // Find a store past the width of the first store.
18272 size_t StartIdx = 0;
18273 while ((StartIdx + 1 < StoreNodes.size()) &&
18274 StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
18275 StoreNodes[StartIdx + 1].OffsetFromBase)
18278 // Bail if we don't have enough candidates to merge.
18279 if (StartIdx + 1 >= StoreNodes.size())
18282 // Trim stores that overlapped with the first store.
18284 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
18286 // Scan the memory operations on the chain and find the first
18287 // non-consecutive store memory address.
18288 unsigned NumConsecutiveStores = 1;
18289 int64_t StartAddress = StoreNodes[0].OffsetFromBase;
18290 // Check that the addresses are consecutive starting from the second
18291 // element in the list of stores.
18292 for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
18293 int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
// Offsets must grow by exactly ElementSizeBytes per element to count as
// consecutive.
18294 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
18296 NumConsecutiveStores = i + 1;
18298 if (NumConsecutiveStores > 1)
18299 return NumConsecutiveStores;
18301 // There are no consecutive stores at the start of the list.
18302 // Remove the first store and try again.
18303 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
// Attempts to merge runs of consecutive constant stores (integer or FP
// constants) into a single wider store, a truncating store, or a vector
// store. Returns true if any merge was performed. Consumes merged or
// rejected entries from the front of StoreNodes each iteration.
// NOTE(review): some original lines are elided in this view.
18307 bool DAGCombiner::tryStoreMergeOfConstants(
18308 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
18309 EVT MemVT, SDNode *RootNode, bool AllowVectors) {
18310 LLVMContext &Context = *DAG.getContext();
18311 const DataLayout &DL = DAG.getDataLayout();
18312 int64_t ElementSizeBytes = MemVT.getStoreSize();
18313 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
18314 bool MadeChange = false;
18316 // Store the constants into memory as one consecutive store.
18317 while (NumConsecutiveStores >= 2) {
18318 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
18319 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
18320 Align FirstStoreAlign = FirstInChain->getAlign();
18321 unsigned LastLegalType = 1;
18322 unsigned LastLegalVectorType = 1;
18323 bool LastIntegerTrunc = false;
18324 bool NonZero = false;
// Index of the first zero element that appears after a non-zero one; used
// below to limit how many candidates can be skipped on failure.
18325 unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
18326 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
18327 StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
18328 SDValue StoredVal = ST->getValue();
18329 bool IsElementZero = false;
18330 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
18331 IsElementZero = C->isZero();
18332 else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
18333 IsElementZero = C->getConstantFPValue()->isNullValue();
18334 if (IsElementZero) {
18335 if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
18336 FirstZeroAfterNonZero = i;
18338 NonZero |= !IsElementZero;
18340 // Find a legal type for the constant store.
18341 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
18342 EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
18343 bool IsFast = false;
18345 // Break early when size is too large to be legal.
18346 if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
// Prefer a plain legal integer store of the combined width...
18349 if (TLI.isTypeLegal(StoreTy) &&
18350 TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
18351 DAG.getMachineFunction()) &&
18352 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18353 *FirstInChain->getMemOperand(), &IsFast) &&
18355 LastIntegerTrunc = false;
18356 LastLegalType = i + 1;
18357 // Or check whether a truncstore is legal.
18358 } else if (TLI.getTypeAction(Context, StoreTy) ==
18359 TargetLowering::TypePromoteInteger) {
18360 EVT LegalizedStoredValTy =
18361 TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
18362 if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
18363 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
18364 DAG.getMachineFunction()) &&
18365 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18366 *FirstInChain->getMemOperand(), &IsFast) &&
18368 LastIntegerTrunc = true;
18369 LastLegalType = i + 1;
18373 // We only use vectors if the constant is known to be zero or the
18374 // target allows it and the function is not marked with the
18375 // noimplicitfloat attribute.
18377 TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
18379 // Find a legal type for the vector store.
18380 unsigned Elts = (i + 1) * NumMemElts;
18381 EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
18382 if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
18383 TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
18384 TLI.allowsMemoryAccess(Context, DL, Ty,
18385 *FirstInChain->getMemOperand(), &IsFast) &&
18387 LastLegalVectorType = i + 1;
// Vector form wins only when it covers strictly more stores and vectors
// are permitted for this function.
18391 bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
18392 unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
18393 bool UseTrunc = LastIntegerTrunc && !UseVector;
18395 // Check if we found a legal integer type that creates a meaningful
18398 // We know that candidate stores are in order and of correct
18399 // shape. While there is no mergeable sequence from the
18400 // beginning one may start later in the sequence. The only
18401 // reason a merge of size N could have failed where another of
18402 // the same size would not have, is if the alignment has
18403 // improved or we've dropped a non-zero value. Drop as many
18404 // candidates as we can here.
18405 unsigned NumSkip = 1;
18406 while ((NumSkip < NumConsecutiveStores) &&
18407 (NumSkip < FirstZeroAfterNonZero) &&
18408 (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
18411 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
18412 NumConsecutiveStores -= NumSkip;
18416 // Check that we can merge these candidates without causing a cycle.
18417 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
// Dependency cycle detected: discard this run and try the remainder.
18419 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
18420 NumConsecutiveStores -= NumElem;
18424 MadeChange |= mergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
18425 /*IsConstantSrc*/ true,
18426 UseVector, UseTrunc);
18428 // Remove merged stores for next iteration.
18429 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
18430 NumConsecutiveStores -= NumElem;
// Attempts to merge consecutive stores whose values are extracted vector
// elements/subvectors into a single wide vector store. Returns true if any
// merge was performed. NOTE(review): some original lines are elided here.
18435 bool DAGCombiner::tryStoreMergeOfExtracts(
18436 SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
18437 EVT MemVT, SDNode *RootNode) {
18438 LLVMContext &Context = *DAG.getContext();
18439 const DataLayout &DL = DAG.getDataLayout();
18440 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
18441 bool MadeChange = false;
18443 // Loop on Consecutive Stores on success.
18444 while (NumConsecutiveStores >= 2) {
18445 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
18446 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
18447 Align FirstStoreAlign = FirstInChain->getAlign();
18448 unsigned NumStoresToMerge = 1;
// Grow the run one store at a time, remembering the widest vector type the
// target accepts for a merged store.
18449 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
18450 // Find a legal type for the vector store.
18451 unsigned Elts = (i + 1) * NumMemElts;
18452 EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
18453 bool IsFast = false;
18455 // Break early when size is too large to be legal.
18456 if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
18459 if (TLI.isTypeLegal(Ty) &&
18460 TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG.getMachineFunction()) &&
18461 TLI.allowsMemoryAccess(Context, DL, Ty,
18462 *FirstInChain->getMemOperand(), &IsFast) &&
18464 NumStoresToMerge = i + 1;
18467 // Check if we found a legal integer type creating a meaningful
18469 if (NumStoresToMerge < 2) {
18470 // We know that candidate stores are in order and of correct
18471 // shape. While there is no mergeable sequence from the
18472 // beginning one may start later in the sequence. The only
18473 // reason a merge of size N could have failed where another of
18474 // the same size would not have, is if the alignment has
18475 // improved. Drop as many candidates as we can here.
18476 unsigned NumSkip = 1;
18477 while ((NumSkip < NumConsecutiveStores) &&
18478 (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
18481 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
18482 NumConsecutiveStores -= NumSkip;
18486 // Check that we can merge these candidates without causing a cycle.
18487 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
// Dependency cycle detected: discard this run and continue with the rest.
18489 StoreNodes.erase(StoreNodes.begin(),
18490 StoreNodes.begin() + NumStoresToMerge);
18491 NumConsecutiveStores -= NumStoresToMerge;
18495 MadeChange |= mergeStoresOfConstantsOrVecElts(
18496 StoreNodes, MemVT, NumStoresToMerge, /*IsConstantSrc*/ false,
18497 /*UseVector*/ true, /*UseTrunc*/ false);
18499 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
18500 NumConsecutiveStores -= NumStoresToMerge;
// Attempts to merge consecutive stores whose stored values come from
// consecutive loads into a single wide load + wide store pair (vector,
// integer, or truncstore/extload form), including a special rotate case for
// a reversed pair of loads. Returns true if any merge was performed.
// NOTE(review): some original lines are elided in this view.
18505 bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
18506 unsigned NumConsecutiveStores, EVT MemVT,
18507 SDNode *RootNode, bool AllowVectors,
18508 bool IsNonTemporalStore,
18509 bool IsNonTemporalLoad) {
18510 LLVMContext &Context = *DAG.getContext();
18511 const DataLayout &DL = DAG.getDataLayout();
18512 int64_t ElementSizeBytes = MemVT.getStoreSize();
18513 unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
18514 bool MadeChange = false;
18516 // Look for load nodes which are used by the stored values.
18517 SmallVector<MemOpLink, 8> LoadNodes;
18519 // Find acceptable loads. Loads need to have the same chain (token factor),
18520 // must not be zext, volatile, indexed, and they must be consecutive.
18521 BaseIndexOffset LdBasePtr;
18523 for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
18524 StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
18525 SDValue Val = peekThroughBitcasts(St->getValue());
18526 LoadSDNode *Ld = cast<LoadSDNode>(Val);
18528 BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
18529 // If this is not the first ptr that we check.
18530 int64_t LdOffset = 0;
18531 if (LdBasePtr.getBase().getNode()) {
18532 // The base ptr must be the same.
18533 if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
18536 // Check that all other base pointers are the same as this one.
18540 // We found a potential memory operand to merge.
18541 LoadNodes.push_back(MemOpLink(Ld, LdOffset));
18544 while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
18545 Align RequiredAlignment;
18546 bool NeedRotate = false;
18547 if (LoadNodes.size() == 2) {
18548 // If we have load/store pair instructions and we only have two values,
18549 // don't bother merging.
18550 if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
18551 StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
18552 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
18553 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
18556 // If the loads are reversed, see if we can rotate the halves into place.
18557 int64_t Offset0 = LoadNodes[0].OffsetFromBase;
18558 int64_t Offset1 = LoadNodes[1].OffsetFromBase;
18559 EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
18560 if (Offset0 - Offset1 == ElementSizeBytes &&
18561 (hasOperation(ISD::ROTL, PairVT) ||
18562 hasOperation(ISD::ROTR, PairVT))) {
// Put the loads into ascending-offset order; the half-swap is undone
// later by emitting a rotate of the merged value (see NeedRotate use).
18563 std::swap(LoadNodes[0], LoadNodes[1]);
18567 LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
18568 unsigned FirstStoreAS = FirstInChain->getAddressSpace();
18569 Align FirstStoreAlign = FirstInChain->getAlign();
18570 LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
18572 // Scan the memory operations on the chain and find the first
18573 // non-consecutive load memory address. These variables hold the index in
18574 // the store node array.
18576 unsigned LastConsecutiveLoad = 1;
18578 // This variable refers to the size and not index in the array.
18579 unsigned LastLegalVectorType = 1;
18580 unsigned LastLegalIntegerType = 1;
18581 bool isDereferenceable = true;
18582 bool DoIntegerTruncate = false;
18583 int64_t StartAddress = LoadNodes[0].OffsetFromBase;
18584 SDValue LoadChain = FirstLoad->getChain();
18585 for (unsigned i = 1; i < LoadNodes.size(); ++i) {
18586 // All loads must share the same chain.
18587 if (LoadNodes[i].MemNode->getChain() != LoadChain)
18590 int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
18591 if (CurrAddress - StartAddress != (ElementSizeBytes * i))
18593 LastConsecutiveLoad = i;
// The merged load is only dereferenceable if every source load is.
18595 if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
18596 isDereferenceable = false;
18598 // Find a legal type for the vector store.
18599 unsigned Elts = (i + 1) * NumMemElts;
18600 EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
18602 // Break early when size is too large to be legal.
18603 if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
18606 bool IsFastSt = false;
18607 bool IsFastLd = false;
18608 // Don't try vector types if we need a rotate. We may still fail the
18609 // legality checks for the integer type, but we can't handle the rotate
18610 // case with vectors.
18611 // FIXME: We could use a shuffle in place of the rotate.
18612 if (!NeedRotate && TLI.isTypeLegal(StoreTy) &&
18613 TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
18614 DAG.getMachineFunction()) &&
18615 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18616 *FirstInChain->getMemOperand(), &IsFastSt) &&
18618 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18619 *FirstLoad->getMemOperand(), &IsFastLd) &&
18621 LastLegalVectorType = i + 1;
18624 // Find a legal type for the integer store.
18625 unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
18626 StoreTy = EVT::getIntegerVT(Context, SizeInBits);
18627 if (TLI.isTypeLegal(StoreTy) &&
18628 TLI.canMergeStoresTo(FirstStoreAS, StoreTy,
18629 DAG.getMachineFunction()) &&
18630 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18631 *FirstInChain->getMemOperand(), &IsFastSt) &&
18633 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18634 *FirstLoad->getMemOperand(), &IsFastLd) &&
18636 LastLegalIntegerType = i + 1;
18637 DoIntegerTruncate = false;
18638 // Or check whether a truncstore and extload is legal.
18639 } else if (TLI.getTypeAction(Context, StoreTy) ==
18640 TargetLowering::TypePromoteInteger) {
18641 EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
// All three extension kinds must be legal because the extload emitted
// below uses ISD::EXTLOAD and legalization may pick any of them.
18642 if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
18643 TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy,
18644 DAG.getMachineFunction()) &&
18645 TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
18646 TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
18647 TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
18648 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18649 *FirstInChain->getMemOperand(), &IsFastSt) &&
18651 TLI.allowsMemoryAccess(Context, DL, StoreTy,
18652 *FirstLoad->getMemOperand(), &IsFastLd) &&
18654 LastLegalIntegerType = i + 1;
18655 DoIntegerTruncate = true;
18660 // Only use vector types if the vector type is larger than the integer
18661 // type. If they are the same, use integers.
18663 LastLegalVectorType > LastLegalIntegerType && AllowVectors;
18664 unsigned LastLegalType =
18665 std::max(LastLegalVectorType, LastLegalIntegerType);
18667 // We add +1 here because the LastXXX variables refer to location while
18668 // the NumElem refers to array/index size.
18669 unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
18670 NumElem = std::min(LastLegalType, NumElem);
18671 Align FirstLoadAlign = FirstLoad->getAlign();
18674 // We know that candidate stores are in order and of correct
18675 // shape. While there is no mergeable sequence from the
18676 // beginning one may start later in the sequence. The only
18677 // reason a merge of size N could have failed where another of
18678 // the same size would not have is if the alignment or either
18679 // the load or store has improved. Drop as many candidates as we
18681 unsigned NumSkip = 1;
18682 while ((NumSkip < LoadNodes.size()) &&
18683 (LoadNodes[NumSkip].MemNode->getAlign() <= FirstLoadAlign) &&
18684 (StoreNodes[NumSkip].MemNode->getAlign() <= FirstStoreAlign))
18686 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
18687 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
18688 NumConsecutiveStores -= NumSkip;
18692 // Check that we can merge these candidates without causing a cycle.
18693 if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
// Dependency cycle detected: drop this run and continue with the rest.
18695 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
18696 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
18697 NumConsecutiveStores -= NumElem;
18701 // Find if it is better to use vectors or integers to load and store
18705 // Find a legal type for the vector store.
18706 unsigned Elts = NumElem * NumMemElts;
18707 JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
18709 unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
18710 JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
18713 SDLoc LoadDL(LoadNodes[0].MemNode);
18714 SDLoc StoreDL(StoreNodes[0].MemNode);
18716 // The merged loads are required to have the same incoming chain, so
18717 // using the first's chain is acceptable.
18719 SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
18720 AddToWorklist(NewStoreChain.getNode());
18722 MachineMemOperand::Flags LdMMOFlags =
18723 isDereferenceable ? MachineMemOperand::MODereferenceable
18724 : MachineMemOperand::MONone;
18725 if (IsNonTemporalLoad)
18726 LdMMOFlags |= MachineMemOperand::MONonTemporal;
18728 MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
18729 ? MachineMemOperand::MONonTemporal
18730 : MachineMemOperand::MONone;
18732 SDValue NewLoad, NewStore;
18733 if (UseVectorTy || !DoIntegerTruncate) {
18734 NewLoad = DAG.getLoad(
18735 JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
18736 FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
18737 SDValue StoreOp = NewLoad;
18739 unsigned LoadWidth = ElementSizeBytes * 8 * 2;
18740 assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
18741 "Unexpected type for rotate-able load pair");
18743 DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
18744 // Target can convert to the identical ROTR if it does not have ROTL.
// Rotate by half the width to swap the two halves back into the order
// the original (reversed) loads required.
18745 StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
18747 NewStore = DAG.getStore(
18748 NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
18749 FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
18750 } else { // This must be the truncstore/extload case
18752 TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
18753 NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
18754 FirstLoad->getChain(), FirstLoad->getBasePtr(),
18755 FirstLoad->getPointerInfo(), JointMemOpVT,
18756 FirstLoadAlign, LdMMOFlags);
18757 NewStore = DAG.getTruncStore(
18758 NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
18759 FirstInChain->getPointerInfo(), JointMemOpVT,
18760 FirstInChain->getAlign(), FirstInChain->getMemOperand()->getFlags());
18763 // Transfer chain users from old loads to the new load.
18764 for (unsigned i = 0; i < NumElem; ++i) {
18765 LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
18766 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
18767 SDValue(NewLoad.getNode(), 1));
18770 // Replace all stores with the new store. Recursively remove corresponding
18771 // values if they are no longer used.
18772 for (unsigned i = 0; i < NumElem; ++i) {
18773 SDValue Val = StoreNodes[i].MemNode->getOperand(1);
18774 CombineTo(StoreNodes[i].MemNode, NewStore);
18775 if (Val->use_empty())
18776 recursivelyDeleteUnusedNodes(Val.getNode());
18780 StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
18781 LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
18782 NumConsecutiveStores -= NumElem;
// Top-level driver for store merging: gathers merge candidates reachable
// from St, sorts them by offset, and repeatedly dispatches runs of
// consecutive stores to the source-specific helpers (constants, extracted
// vector elements, or loads). Returns true if any stores were merged.
// NOTE(review): some original lines (early returns etc.) are elided here.
18787 bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
18788 if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
18791 // TODO: Extend this function to merge stores of scalable vectors.
18792 // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
18793 // store since we know <vscale x 16 x i8> is exactly twice as large as
18794 // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
18795 EVT MemVT = St->getMemoryVT();
18796 if (MemVT.isScalableVector())
18798 if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
18801 // This function cannot currently deal with non-byte-sized memory sizes.
18802 int64_t ElementSizeBytes = MemVT.getStoreSize();
18803 if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
18806 // Do not bother looking at stored values that are not constants, loads, or
18807 // extracted vector elements.
18808 SDValue StoredVal = peekThroughBitcasts(St->getValue());
18809 const StoreSource StoreSrc = getStoreSource(StoredVal);
18810 if (StoreSrc == StoreSource::Unknown)
18813 SmallVector<MemOpLink, 8> StoreNodes;
18815 // Find potential store merge candidates by searching through chain sub-DAG
18816 getStoreMergeCandidates(St, StoreNodes, RootNode);
18818 // Check if there is anything to merge.
18819 if (StoreNodes.size() < 2)
18822 // Sort the memory operands according to their distance from the
18824 llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
18825 return LHS.OffsetFromBase < RHS.OffsetFromBase;
// Vector merges are disallowed under noimplicitfloat.
18828 bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
18829 Attribute::NoImplicitFloat);
18830 bool IsNonTemporalStore = St->isNonTemporal();
18831 bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
18832 cast<LoadSDNode>(StoredVal)->isNonTemporal();
18834 // Store Merge attempts to merge the lowest stores. This generally
18835 // works out as if successful, as the remaining stores are checked
18836 // after the first collection of stores is merged. However, in the
18837 // case that a non-mergeable store is found first, e.g., {p[-2],
18838 // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
18839 // mergeable cases. To prevent this, we prune such stores from the
18840 // front of StoreNodes here.
18841 bool MadeChange = false;
18842 while (StoreNodes.size() > 1) {
18843 unsigned NumConsecutiveStores =
18844 getConsecutiveStores(StoreNodes, ElementSizeBytes);
18845 // There are no more stores in the list to examine.
18846 if (NumConsecutiveStores == 0)
18849 // We have at least 2 consecutive stores. Try to merge them.
18850 assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
18851 switch (StoreSrc) {
18852 case StoreSource::Constant:
18853 MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
18854 MemVT, RootNode, AllowVectors);
18857 case StoreSource::Extract:
18858 MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
18862 case StoreSource::Load:
18863 MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
18864 MemVT, RootNode, AllowVectors,
18865 IsNonTemporalStore, IsNonTemporalLoad);
18869 llvm_unreachable("Unhandled store source type");
// Re-creates ST on top of BetterChain (preserving truncating-store-ness and
// the memory operand) and ties the old and new chains together with a
// TokenFactor so neither ordering is lost. Returns the combined result via
// CombineTo. NOTE(review): declarations of SL/ReplStore are elided in this
// chunk view.
18875 SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
18879 // Replace the chain to avoid dependency.
18880 if (ST->isTruncatingStore()) {
18881 ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
18882 ST->getBasePtr(), ST->getMemoryVT(),
18883 ST->getMemOperand());
18885 ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
18886 ST->getMemOperand());
18889 // Create token to keep both nodes around.
18890 SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
18891 MVT::Other, ST->getChain(), ReplStore);
18893 // Make sure the new and old chains are cleaned up.
18894 AddToWorklist(Token.getNode());
18896 // Don't add users to work list.
18897 return CombineTo(ST, Token, false);
// Turns a store of an FP constant into the equivalent integer store(s) of
// its bit pattern (f32 -> one i32 store; f64 -> one i64 store, or two i32
// stores when i64 stores are not available). Returns the replacement store
// chain, or presumably an empty SDValue on the elided bail-out paths.
// NOTE(review): several original lines (early returns, f64 case label,
// closing braces) are elided in this chunk view.
18900 SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
18901 SDValue Value = ST->getValue();
// TargetConstantFP must stay as-is; it is already in final form.
18902 if (Value.getOpcode() == ISD::TargetConstantFP)
18905 if (!ISD::isNormalStore(ST))
18910 SDValue Chain = ST->getChain();
18911 SDValue Ptr = ST->getBasePtr();
18913 const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);
18915 // NOTE: If the original store is volatile, this transform must not increase
18916 // the number of stores. For example, on x86-32 an f64 can be stored in one
18917 // processor operation but an i64 (which is not legal) requires two. So the
18918 // transform should not be done in this case.
18921 switch (CFP->getSimpleValueType(0).SimpleTy) {
18923 llvm_unreachable("Unknown FP type");
18924 case MVT::f16: // We don't do this for these yet.
18931 if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
18932 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Reinterpret the f32 bits as an i32 constant and store that instead.
18933 Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
18934 bitcastToAPInt().getZExtValue(), SDLoc(CFP),
18936 return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
18941 if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
18943 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
// Reinterpret the f64 bits as a single i64 constant store.
18944 Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
18945 getZExtValue(), SDLoc(CFP), MVT::i64);
18946 return DAG.getStore(Chain, DL, Tmp,
18947 Ptr, ST->getMemOperand());
18950 if (ST->isSimple() &&
18951 TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
18952 // Many FP stores are not made apparent until after legalize, e.g. for
18953 // argument passing. Since this is so common, custom legalize the
18954 // 64-bit integer store into two 32-bit stores.
18955 uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
18956 SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
18957 SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
// On big-endian targets the high word goes at the lower address.
18958 if (DAG.getDataLayout().isBigEndian())
18961 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
18962 AAMDNodes AAInfo = ST->getAAInfo();
18964 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
18965 ST->getOriginalAlign(), MMOFlags, AAInfo);
18966 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(4), DL);
18967 SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
18968 ST->getPointerInfo().getWithOffset(4),
18969 ST->getOriginalAlign(), MMOFlags, AAInfo);
// Join the two halves so both stores are kept ordered on the chain.
18970 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
// visitSTORE - Combine a STORE node.  The visible transforms, in order:
//  * forward a BITCAST operand into the store when the target reports the
//    re-typed store beneficial (simple stores pre-legalization, or when the
//    re-typed store is legal);
//  * drop stores of undef;
//  * refine alignment via DAG.InferPtrAlign (refines N in place);
//  * try FP<->int load/store pairing and truncstore (BSWAP) merging;
//  * find better neighbor chains for unindexed stores;
//  * simplify truncating stores: peel extensions, SimplifyDemandedBits on the
//    stored value, SimplifyMultipleUseDemandedBits, shrink stored constants;
//  * delete dead stores (store of a value just loaded from the same address,
//    or a preceding store wholly overwritten by this one);
//  * fold FP_ROUND/TRUNCATE into a truncating store;
//  * merge consecutive stores, form pre/post-indexed stores, replace stores
//    of FP constants, split merged values; finally ReduceLoadOpStoreWidth.
18978 SDValue DAGCombiner::visitSTORE(SDNode *N) {
18979 StoreSDNode *ST = cast<StoreSDNode>(N);
18980 SDValue Chain = ST->getChain();
18981 SDValue Value = ST->getValue();
18982 SDValue Ptr = ST->getBasePtr();
18984 // If this is a store of a bit convert, store the input value if the
18985 // resultant store does not need a higher alignment than the original.
18986 if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
18987 ST->isUnindexed()) {
18988 EVT SVT = Value.getOperand(0).getValueType();
18989 // If the store is volatile, we only want to change the store type if the
18990 // resulting store is legal. Otherwise we might increase the number of
18991 // memory accesses. We don't care if the original type was legal or not
18992 // as we assume software couldn't rely on the number of accesses of an
18994 // TODO: May be able to relax for unordered atomics (see D66309)
18995 if (((!LegalOperations && ST->isSimple()) ||
18996 TLI.isOperationLegal(ISD::STORE, SVT)) &&
18997 TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
18998 DAG, *ST->getMemOperand())) {
18999 return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
19000 ST->getMemOperand());
19004 // Turn 'store undef, Ptr' -> nothing.
19005 if (Value.isUndef() && ST->isUnindexed())
19008 // Try to infer better alignment information than the store already has.
19009 if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
19010 if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
19011 if (*Alignment > ST->getAlign() &&
19012 isAligned(*Alignment, ST->getSrcValueOffset())) {
19014 DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
19015 ST->getMemoryVT(), *Alignment,
19016 ST->getMemOperand()->getFlags(), ST->getAAInfo());
19017 // NewStore will always be N as we are only refining the alignment
19018 assert(NewStore.getNode() == N);
19024 // Try transforming a pair floating point load / store ops to integer
19025 // load / store ops.
19026 if (SDValue NewST = TransformFPLoadStorePair(N))
19029 // Try transforming several stores into STORE (BSWAP).
19030 if (SDValue Store = mergeTruncStores(ST))
19033 if (ST->isUnindexed()) {
19034 // Walk up chain skipping non-aliasing memory nodes, on this store and any
19035 // adjacent stores.
19036 if (findBetterNeighborChains(ST)) {
19037 // replaceStoreChain uses CombineTo, which handled all of the worklist
19038 // manipulation. Return the original node to not do anything else.
19039 return SDValue(ST, 0);
// Re-read the chain: findBetterNeighborChains may have replaced it.
19041 Chain = ST->getChain();
19044 // FIXME: is there such a thing as a truncating indexed store?
19045 if (ST->isTruncatingStore() && ST->isUnindexed() &&
19046 Value.getValueType().isInteger() &&
19047 (!isa<ConstantSDNode>(Value) ||
19048 !cast<ConstantSDNode>(Value)->isOpaque())) {
19049 // Convert a truncating store of an extension into a standard store.
19050 if ((Value.getOpcode() == ISD::ZERO_EXTEND ||
19051 Value.getOpcode() == ISD::SIGN_EXTEND ||
19052 Value.getOpcode() == ISD::ANY_EXTEND) &&
19053 Value.getOperand(0).getValueType() == ST->getMemoryVT() &&
19054 TLI.isOperationLegalOrCustom(ISD::STORE, ST->getMemoryVT()))
19055 return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
19056 ST->getMemOperand());
// Only the low MemoryVT bits of Value survive a truncating store.
19058 APInt TruncDemandedBits =
19059 APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
19060 ST->getMemoryVT().getScalarSizeInBits());
19062 // See if we can simplify the operation with SimplifyDemandedBits, which
19063 // only works if the value has a single use.
19064 AddToWorklist(Value.getNode());
19065 if (SimplifyDemandedBits(Value, TruncDemandedBits)) {
19066 // Re-visit the store if anything changed and the store hasn't been merged
19067 // with another node (N is deleted) SimplifyDemandedBits will add Value's
19068 // node back to the worklist if necessary, but we also need to re-visit
19069 // the Store node itself.
19070 if (N->getOpcode() != ISD::DELETED_NODE)
19072 return SDValue(N, 0);
19075 // Otherwise, see if we can simplify the input to this truncstore with
19076 // knowledge that only the low bits are being used. For example:
19077 // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
19078 if (SDValue Shorter =
19079 TLI.SimplifyMultipleUseDemandedBits(Value, TruncDemandedBits, DAG))
19080 return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
19081 ST->getMemOperand());
19083 // If we're storing a truncated constant, see if we can simplify it.
19084 // TODO: Move this to targetShrinkDemandedConstant?
19085 if (auto *Cst = dyn_cast<ConstantSDNode>(Value))
19086 if (!Cst->isOpaque()) {
19087 const APInt &CValue = Cst->getAPIntValue();
19088 APInt NewVal = CValue & TruncDemandedBits;
19089 if (NewVal != CValue) {
19091 DAG.getConstant(NewVal, SDLoc(N), Value.getValueType());
19092 return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr,
19093 ST->getMemoryVT(), ST->getMemOperand());
19098 // If this is a load followed by a store to the same location, then the store
19100 // TODO: Can relax for unordered atomics (see D66309)
19101 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
19102 if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
19103 ST->isUnindexed() && ST->isSimple() &&
19104 Ld->getAddressSpace() == ST->getAddressSpace() &&
19105 // There can't be any side effects between the load and store, such as
19106 // a call or store.
19107 Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
19108 // The store is dead, remove it.
19113 // TODO: Can relax for unordered atomics (see D66309)
19114 if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
19115 if (ST->isUnindexed() && ST->isSimple() &&
19116 ST1->isUnindexed() && ST1->isSimple()) {
19117 if (OptLevel != CodeGenOpt::None && ST1->getBasePtr() == Ptr &&
19118 ST1->getValue() == Value && ST->getMemoryVT() == ST1->getMemoryVT() &&
19119 ST->getAddressSpace() == ST1->getAddressSpace()) {
19120 // If this is a store followed by a store with the same value to the
19121 // same location, then the store is dead/noop.
19125 if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
19126 !ST1->getBasePtr().isUndef() &&
19127 // BaseIndexOffset and the code below requires knowing the size
19128 // of a vector, so bail out if MemoryVT is scalable.
19129 !ST->getMemoryVT().isScalableVector() &&
19130 !ST1->getMemoryVT().isScalableVector() &&
19131 ST->getAddressSpace() == ST1->getAddressSpace()) {
19132 const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
19133 const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
19134 unsigned STBitSize = ST->getMemoryVT().getFixedSizeInBits();
19135 unsigned ChainBitSize = ST1->getMemoryVT().getFixedSizeInBits();
19136 // If this is a store whose preceding store is to a subset of the current
19137 // location and no other node is chained to that store, we can
19138 // effectively drop the store. Do not remove stores to undef as they may
19139 // be used as data sinks.
19140 if (STBase.contains(DAG, STBitSize, ChainBase, ChainBitSize)) {
19141 CombineTo(ST1, ST1->getChain());
19148 // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
19149 // truncating store. We can do this even if this is already a truncstore.
19150 if ((Value.getOpcode() == ISD::FP_ROUND ||
19151 Value.getOpcode() == ISD::TRUNCATE) &&
19152 Value->hasOneUse() && ST->isUnindexed() &&
19153 TLI.canCombineTruncStore(Value.getOperand(0).getValueType(),
19154 ST->getMemoryVT(), LegalOperations)) {
19155 return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
19156 Ptr, ST->getMemoryVT(), ST->getMemOperand());
19159 // Always perform this optimization before types are legal. If the target
19160 // prefers, also try this after legalization to catch stores that were created
19161 // by intrinsics or other nodes.
19162 if (!LegalTypes || (TLI.mergeStoresAfterLegalization(ST->getMemoryVT()))) {
19164 // There can be multiple store sequences on the same chain.
19165 // Keep trying to merge store sequences until we are unable to do so
19166 // or until we merge the last store on the chain.
19167 bool Changed = mergeConsecutiveStores(ST);
19168 if (!Changed) break;
19169 // Return N as merge only uses CombineTo and no worklist clean
19170 // up is necessary.
19171 if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
19172 return SDValue(N, 0);
19176 // Try transforming N to an indexed store.
19177 if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
19178 return SDValue(N, 0);
19180 // Turn 'store float 1.0, Ptr' -> 'store int 0x12345678, Ptr'
19182 // Make sure to do this only after attempting to merge stores in order to
19183 // avoid changing the types of some subset of stores due to visit order,
19184 // preventing their merging.
19185 if (isa<ConstantFPSDNode>(ST->getValue())) {
19186 if (SDValue NewSt = replaceStoreOfFPConstant(ST))
19190 if (SDValue NewSt = splitMergedValStore(ST))
19193 return ReduceLoadOpStoreWidth(N);
// visitLIFETIME_END - Remove stores that write purely within an object whose
// lifetime is about to end.  Walks single-use chains upward from the
// LIFETIME_END node, passing through TokenFactors and through lifetime
// markers proven not to alias N, and deletes any simple, unindexed,
// fixed-size store fully contained in the dying object's extent.
19196 SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
19197 const auto *LifetimeEnd = cast<LifetimeSDNode>(N);
19198 if (!LifetimeEnd->hasOffset())
// Base/offset of the object whose lifetime ends (operand 1 is the frame
// index, with the marker's static offset).
19201 const BaseIndexOffset LifetimeEndBase(N->getOperand(1), SDValue(),
19202 LifetimeEnd->getOffset(), false);
19204 // We walk up the chains to find stores.
19205 SmallVector<SDValue, 8> Chains = {N->getOperand(0)};
19206 while (!Chains.empty()) {
19207 SDValue Chain = Chains.pop_back_val();
// Only traverse chain nodes whose sole user is this walk; a second user
// could still observe the store we would delete.
19208 if (!Chain.hasOneUse())
19210 switch (Chain.getOpcode()) {
19211 case ISD::TokenFactor:
19212 for (unsigned Nops = Chain.getNumOperands(); Nops;)
19213 Chains.push_back(Chain.getOperand(--Nops));
19215 case ISD::LIFETIME_START:
19216 case ISD::LIFETIME_END:
19217 // We can forward past any lifetime start/end that can be proven not to
19219 if (!mayAlias(Chain.getNode(), N))
19220 Chains.push_back(Chain.getOperand(0));
// NOTE(review): ST is dereferenced below without a null check; this relies
// on the enclosing switch case guaranteeing Chain is a store — confirm.
19223 StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
19224 // TODO: Can relax for unordered atomics (see D66309)
19225 if (!ST->isSimple() || ST->isIndexed())
19227 const TypeSize StoreSize = ST->getMemoryVT().getStoreSize();
19228 // The bounds of a scalable store are not known until runtime, so this
19229 // store cannot be elided.
19230 if (StoreSize.isScalable())
19232 const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
19233 // If we store purely within object bounds just before its lifetime ends,
19234 // we can remove the store.
19235 if (LifetimeEndBase.contains(DAG, LifetimeEnd->getSize() * 8, StoreBase,
19236 StoreSize.getFixedSize() * 8)) {
19237 LLVM_DEBUG(dbgs() << "\nRemoving store:"; StoreBase.dump();
19238 dbgs() << "\nwithin LIFETIME_END of : ";
19239 LifetimeEndBase.dump(); dbgs() << "\n");
19240 CombineTo(ST, ST->getChain());
19241 return SDValue(N, 0);
19249 /// For the instruction sequence of store below, F and I values
19250 /// are bundled together as an i64 value before being stored into memory.
19251 /// Sometimes it is more efficient to generate separate stores for F and I,
19252 /// which can remove the bitwise instructions or sink them to colder places.
19254 /// (store (or (zext (bitcast F to i32) to i64),
19255 /// (shl (zext I to i64), 32)), addr) -->
19256 /// (store F, addr) and (store I, addr+4)
19258 /// Similarly, splitting for other merged store can also be beneficial, like:
19259 /// For pair of {i32, i32}, i64 store --> two i32 stores.
19260 /// For pair of {i32, i16}, i64 store --> two i32 stores.
19261 /// For pair of {i16, i16}, i32 store --> two i16 stores.
19262 /// For pair of {i16, i8}, i32 store --> two i16 stores.
19263 /// For pair of {i8, i8}, i16 store --> two i8 stores.
19265 /// We allow each target to determine specifically which kind of splitting is
19268 /// The store patterns are commonly seen from the simple code snippet below
19269 /// if only std::make_pair(...) is sroa transformed before inlined into hoo.
19270 /// void goo(const std::pair<int, float> &);
19273 /// goo(std::make_pair(tmp, ftmp));
// splitMergedValStore - Split a store of (or (zext Lo), (shl (zext Hi),
// HalfValBitSize)) into two half-width stores, when the target reports that
// two stores are cheaper than merging the bits (see the doc comment above).
// Returns an SDValue on success, or a null SDValue when the pattern does not
// match or the target declines.
19277 SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
19278 if (OptLevel == CodeGenOpt::None)
19281 // Can't change the number of memory accesses for a volatile store or break
19282 // atomicity for an atomic one.
19283 if (!ST->isSimple())
19286 SDValue Val = ST->getValue();
19289 // Match OR operand.
19290 if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
19293 // Match SHL operand and get Lower and Higher parts of Val.
19294 SDValue Op1 = Val.getOperand(0);
19295 SDValue Op2 = Val.getOperand(1);
// OR is commutative: canonicalize so Op1 is the SHL side.
19297 if (Op1.getOpcode() != ISD::SHL) {
19298 std::swap(Op1, Op2);
19299 if (Op1.getOpcode() != ISD::SHL)
19303 Hi = Op1.getOperand(0);
19304 if (!Op1.hasOneUse())
19307 // Match shift amount to HalfValBitSize.
19308 unsigned HalfValBitSize = Val.getValueSizeInBits() / 2;
19309 ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
19310 if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
19313 // Lo and Hi are zero-extended from int with size less equal than 32
19315 if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
19316 !Lo.getOperand(0).getValueType().isScalarInteger() ||
19317 Lo.getOperand(0).getValueSizeInBits() > HalfValBitSize ||
19318 Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
19319 !Hi.getOperand(0).getValueType().isScalarInteger() ||
19320 Hi.getOperand(0).getValueSizeInBits() > HalfValBitSize)
19323 // Use the EVT of low and high parts before bitcast as the input
19324 // of target query.
19325 EVT LowTy = (Lo.getOperand(0).getOpcode() == ISD::BITCAST)
19326 ? Lo.getOperand(0).getValueType()
19327 : Lo.getValueType();
19328 EVT HighTy = (Hi.getOperand(0).getOpcode() == ISD::BITCAST)
19329 ? Hi.getOperand(0).getValueType()
19330 : Hi.getValueType();
19331 if (!TLI.isMultiStoresCheaperThanBitsMerge(LowTy, HighTy))
19334 // Start to split store.
19335 MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
19336 AAMDNodes AAInfo = ST->getAAInfo();
19338 // Change the sizes of Lo and Hi's value types to HalfValBitSize.
19339 EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
19340 Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
19341 Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
19343 SDValue Chain = ST->getChain();
19344 SDValue Ptr = ST->getBasePtr();
19345 // Lower value store.
19346 SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
19347 ST->getOriginalAlign(), MMOFlags, AAInfo);
// Advance the pointer by half the original width for the high half.
19348 Ptr = DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(HalfValBitSize / 8), DL);
19349 // Higher value store.
19350 SDValue St1 = DAG.getStore(
19351 St0, DL, Hi, Ptr, ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
19352 ST->getOriginalAlign(), MMOFlags, AAInfo);
19356 /// Convert a disguised subvector insertion into a shuffle:
// Fix: the assert below guards INSERT_VECTOR_ELT, but its message said
// "Expected extract_vector_elt" (copy-paste from the extract combine) —
// corrected so a failing assert reports the right node kind.
19357 SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
19358 assert(N->getOpcode() == ISD::INSERT_VECTOR_ELT &&
19359 "Expected insert_vector_elt");
19360 SDValue InsertVal = N->getOperand(1);
19361 SDValue Vec = N->getOperand(0);
19363 // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N),
19365 // --> (vector_shuffle X, Y) and variations where shuffle operands may be
19367 if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
19368 InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19369 isa<ConstantSDNode>(InsertVal.getOperand(1))) {
19370 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
19371 ArrayRef<int> Mask = SVN->getMask();
19373 SDValue X = Vec.getOperand(0);
19374 SDValue Y = Vec.getOperand(1);
19376 // Vec's operand 0 is using indices from 0 to N-1 and
19377 // operand 1 from N to 2N - 1, where N is the number of
19378 // elements in the vectors.
19379 SDValue InsertVal0 = InsertVal.getOperand(0);
19380 int ElementOffset = -1;
19382 // We explore the inputs of the shuffle in order to see if we find the
19383 // source of the extract_vector_elt. If so, we can use it to modify the
19384 // shuffle rather than perform an insert_vector_elt.
19385 SmallVector<std::pair<int, SDValue>, 8> ArgWorkList;
19386 ArgWorkList.emplace_back(Mask.size(), Y);
19387 ArgWorkList.emplace_back(0, X);
19389 while (!ArgWorkList.empty()) {
19392 std::tie(ArgOffset, ArgVal) = ArgWorkList.pop_back_val();
19394 if (ArgVal == InsertVal0) {
19395 ElementOffset = ArgOffset;
19399 // Peek through concat_vector.
19400 if (ArgVal.getOpcode() == ISD::CONCAT_VECTORS) {
19401 int CurrentArgOffset =
19402 ArgOffset + ArgVal.getValueType().getVectorNumElements();
19403 int Step = ArgVal.getOperand(0).getValueType().getVectorNumElements();
19404 for (SDValue Op : reverse(ArgVal->ops())) {
19405 CurrentArgOffset -= Step;
19406 ArgWorkList.emplace_back(CurrentArgOffset, Op);
19409 // Make sure we went through all the elements and did not screw up index
19411 assert(CurrentArgOffset == ArgOffset);
19415 // If we failed to find a match, see if we can replace an UNDEF shuffle
19417 if (ElementOffset == -1 && Y.isUndef() &&
19418 InsertVal0.getValueType() == Y.getValueType()) {
19419 ElementOffset = Mask.size();
19423 if (ElementOffset != -1) {
19424 SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
19426 auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
19427 NewMask[InsIndex] = ElementOffset + ExtrIndex->getZExtValue();
19428 assert(NewMask[InsIndex] <
19429 (int)(2 * Vec.getValueType().getVectorNumElements()) &&
19430 NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bound");
19432 SDValue LegalShuffle =
19433 TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
19436 return LegalShuffle;
19440 // insert_vector_elt V, (bitcast X from vector type), IdxC -->
19441 // bitcast(shuffle (bitcast V), (extended X), Mask)
19442 // Note: We do not use an insert_subvector node because that requires a
19443 // legal subvector type.
19444 if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
19445 !InsertVal.getOperand(0).getValueType().isVector())
19448 SDValue SubVec = InsertVal.getOperand(0);
19449 SDValue DestVec = N->getOperand(0);
19450 EVT SubVecVT = SubVec.getValueType();
19451 EVT VT = DestVec.getValueType();
19452 unsigned NumSrcElts = SubVecVT.getVectorNumElements();
19453 // If the source only has a single vector element, the cost of adding
19454 // it to a vector is likely to exceed the cost of an insert_vector_elt.
19455 if (NumSrcElts == 1)
19457 unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
19458 unsigned NumMaskVals = ExtendRatio * NumSrcElts;
19460 // Step 1: Create a shuffle mask that implements this insert operation. The
19461 // vector that we are inserting into will be operand 0 of the shuffle, so
19462 // those elements are just 'i'. The inserted subvector is in the first
19463 // positions of operand 1 of the shuffle. Example:
19464 // insert v4i32 V, (v2i16 X), 2 --> shuffle v8i16 V', X', {0,1,2,3,8,9,6,7}
19465 SmallVector<int, 16> Mask(NumMaskVals);
19466 for (unsigned i = 0; i != NumMaskVals; ++i) {
19467 if (i / NumSrcElts == InsIndex)
19468 Mask[i] = (i % NumSrcElts) + NumMaskVals;
19473 // Bail out if the target can not handle the shuffle we want to create.
19474 EVT SubVecEltVT = SubVecVT.getVectorElementType();
19475 EVT ShufVT = EVT::getVectorVT(*DAG.getContext(), SubVecEltVT, NumMaskVals);
19476 if (!TLI.isShuffleMaskLegal(Mask, ShufVT))
19479 // Step 2: Create a wide vector from the inserted source vector by appending
19480 // undefined elements. This is the same size as our destination vector.
19482 SmallVector<SDValue, 8> ConcatOps(ExtendRatio, DAG.getUNDEF(SubVecVT));
19483 ConcatOps[0] = SubVec;
19484 SDValue PaddedSubV = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShufVT, ConcatOps);
19486 // Step 3: Shuffle in the padded subvector.
19487 SDValue DestVecBC = DAG.getBitcast(ShufVT, DestVec);
19488 SDValue Shuf = DAG.getVectorShuffle(ShufVT, DL, DestVecBC, PaddedSubV, Mask);
19489 AddToWorklist(PaddedSubV.getNode());
19490 AddToWorklist(DestVecBC.getNode());
19491 AddToWorklist(Shuf.getNode());
19492 return DAG.getBitcast(VT, Shuf);
// visitINSERT_VECTOR_ELT - Combine an INSERT_VECTOR_ELT node.  Visible
// transforms: out-of-bounds insert -> undef; redundant re-insert of an
// extracted element -> original vector; variable insert into undef ->
// splat/build_vector; shuffle formation via combineInsertEltToShuffle;
// <1 x T> special cases; canonical ordering of constant-index insert
// chains; and collapsing an insert chain into a single BUILD_VECTOR
// (optionally filling known-zero elements with explicit zeros).
19495 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
19496 SDValue InVec = N->getOperand(0);
19497 SDValue InVal = N->getOperand(1);
19498 SDValue EltNo = N->getOperand(2);
19501 EVT VT = InVec.getValueType();
19502 auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
19504 // Insert into out-of-bounds element is undefined.
19505 if (IndexC && VT.isFixedLengthVector() &&
19506 IndexC->getZExtValue() >= VT.getVectorNumElements())
19507 return DAG.getUNDEF(VT);
19509 // Remove redundant insertions:
19510 // (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
19511 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19512 InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
19516 // If this is variable insert to undef vector, it might be better to splat:
19517 // inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
19518 if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
19519 if (VT.isScalableVector())
19520 return DAG.getSplatVector(VT, DL, InVal);
19522 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
19523 return DAG.getBuildVector(VT, DL, Ops);
// The remaining folds require a fixed-length vector and a constant index.
19528 if (VT.isScalableVector())
19531 unsigned NumElts = VT.getVectorNumElements();
19533 // We must know which element is being inserted for folds below here.
19534 unsigned Elt = IndexC->getZExtValue();
19536 if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
19539 // Handle <1 x ???> vector insertion special cases.
19540 if (NumElts == 1) {
19541 // insert_vector_elt(x, extract_vector_elt(y, 0), 0) -> y
19542 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19543 InVal.getOperand(0).getValueType() == VT &&
19544 isNullConstant(InVal.getOperand(1)))
19545 return InVal.getOperand(0);
19548 // Canonicalize insert_vector_elt dag nodes.
19550 // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
19551 // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
19553 // Do this only if the child insert_vector node has one use; also
19554 // do this only if indices are both constants and Idx1 < Idx0.
19555 if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
19556 && isa<ConstantSDNode>(InVec.getOperand(2))) {
19557 unsigned OtherElt = InVec.getConstantOperandVal(2);
19558 if (Elt < OtherElt) {
19560 SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
19561 InVec.getOperand(0), InVal, EltNo);
19562 AddToWorklist(NewOp.getNode());
19563 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
19564 VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
19568 // Attempt to convert an insert_vector_elt chain into a legal build_vector.
19569 if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) {
19570 // vXi1 vector - we don't need to recurse.
19572 return DAG.getBuildVector(VT, DL, {InVal});
19574 // If we haven't already collected the element, insert into the op list.
// MaxEltVT tracks the widest integer element type seen, so all operands
// can be extended/truncated to one common type for the BUILD_VECTOR.
19575 EVT MaxEltVT = InVal.getValueType();
19576 auto AddBuildVectorOp = [&](SmallVectorImpl<SDValue> &Ops, SDValue Elt,
19580 if (VT.isInteger()) {
19581 EVT EltVT = Elt.getValueType();
19582 MaxEltVT = MaxEltVT.bitsGE(EltVT) ? MaxEltVT : EltVT;
19587 // Ensure all the operands are the same value type, fill any missing
19588 // operands with UNDEF and create the BUILD_VECTOR.
19589 auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
19590 assert(Ops.size() == NumElts && "Unexpected vector size");
19591 for (SDValue &Op : Ops) {
19593 Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
19595 Op = DAG.getUNDEF(MaxEltVT);
19597 return DAG.getBuildVector(VT, DL, Ops);
19600 SmallVector<SDValue, 8> Ops(NumElts, SDValue());
19603 // Recurse up a INSERT_VECTOR_ELT chain to build a BUILD_VECTOR.
19604 for (SDValue CurVec = InVec; CurVec;) {
19605 // UNDEF - build new BUILD_VECTOR from already inserted operands.
19606 if (CurVec.isUndef())
19607 return CanonicalizeBuildVector(Ops);
19609 // BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
19610 if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) {
19611 for (unsigned I = 0; I != NumElts; ++I)
19612 AddBuildVectorOp(Ops, CurVec.getOperand(I), I);
19613 return CanonicalizeBuildVector(Ops);
19616 // SCALAR_TO_VECTOR - insert unused scalar and build new BUILD_VECTOR.
19617 if (CurVec.getOpcode() == ISD::SCALAR_TO_VECTOR && CurVec.hasOneUse()) {
19618 AddBuildVectorOp(Ops, CurVec.getOperand(0), 0);
19619 return CanonicalizeBuildVector(Ops);
19622 // INSERT_VECTOR_ELT - insert operand and continue up the chain.
19623 if (CurVec.getOpcode() == ISD::INSERT_VECTOR_ELT && CurVec.hasOneUse())
19624 if (auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2)))
19625 if (CurIdx->getAPIntValue().ult(NumElts)) {
19626 unsigned Idx = CurIdx->getZExtValue();
19627 AddBuildVectorOp(Ops, CurVec.getOperand(1), Idx);
19629 // Found entire BUILD_VECTOR.
19630 if (all_of(Ops, [](SDValue Op) { return !!Op; }))
19631 return CanonicalizeBuildVector(Ops);
19633 CurVec = CurVec->getOperand(0);
19637 // Failed to find a match in the chain - bail.
19641 // See if we can fill in the missing constant elements as zeros.
19642 // TODO: Should we do this for any constant?
19643 APInt DemandedZeroElts = APInt::getZero(NumElts);
19644 for (unsigned I = 0; I != NumElts; ++I)
19646 DemandedZeroElts.setBit(I);
19648 if (DAG.MaskedVectorIsZero(InVec, DemandedZeroElts)) {
19649 SDValue Zero = VT.isInteger() ? DAG.getConstant(0, DL, MaxEltVT)
19650 : DAG.getConstantFP(0, DL, MaxEltVT);
19651 for (unsigned I = 0; I != NumElts; ++I)
19655 return CanonicalizeBuildVector(Ops);
// scalarizeExtractedVectorLoad - Replace (extract_vector_elt (load V), Idx)
// with a narrow scalar load of just the extracted element, when the target
// allows the narrower access.  EVE is the extract node, InVecVT the loaded
// vector type, OriginalLoad the (simple) vector load being narrowed.
//
// Fix: ExtTy was inverted (NON_EXTLOAD when ResultVT is wider than the
// element).  The function's own logic below issues an *extending* load
// exactly when ResultVT.bitsGT(VecEltVT), so the ext type reported to
// TLI.shouldReduceLoadWidth must be EXTLOAD in that case and NON_EXTLOAD
// otherwise (matching upstream LLVM).
19662 SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
19664 LoadSDNode *OriginalLoad) {
19665 assert(OriginalLoad->isSimple());
19667 EVT ResultVT = EVE->getValueType(0);
19668 EVT VecEltVT = InVecVT.getVectorElementType();
19670 // If the vector element type is not a multiple of a byte then we are unable
19671 // to correctly compute an address to load only the extracted element as a
19673 if (!VecEltVT.isByteSized())
// A result wider than the element requires an extending scalar load.
19676 ISD::LoadExtType ExtTy =
19677 ResultVT.bitsGT(VecEltVT) ? ISD::EXTLOAD : ISD::NON_EXTLOAD;
19678 if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) ||
19679 !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
19682 Align Alignment = OriginalLoad->getAlign();
19683 MachinePointerInfo MPI;
19685 if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
19686 int Elt = ConstEltNo->getZExtValue();
19687 unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
19688 MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
19689 Alignment = commonAlignment(Alignment, PtrOff);
19691 // Discard the pointer info except the address space because the memory
19692 // operand can't represent this new access since the offset is variable.
19693 MPI = MachinePointerInfo(OriginalLoad->getPointerInfo().getAddrSpace());
19694 Alignment = commonAlignment(Alignment, VecEltVT.getSizeInBits() / 8);
19697 bool IsFast = false;
19698 if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VecEltVT,
19699 OriginalLoad->getAddressSpace(), Alignment,
19700 OriginalLoad->getMemOperand()->getFlags(),
19705 SDValue NewPtr = TLI.getVectorElementPointer(DAG, OriginalLoad->getBasePtr(),
19708 // We are replacing a vector load with a scalar load. The new load must have
19709 // identical memory op ordering to the original.
19711 if (ResultVT.bitsGT(VecEltVT)) {
19712 // If the result type of vextract is wider than the load, then issue an
19713 // extending load instead.
19714 ISD::LoadExtType ExtType =
19715 TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT, VecEltVT) ? ISD::ZEXTLOAD
19717 Load = DAG.getExtLoad(ExtType, DL, ResultVT, OriginalLoad->getChain(),
19718 NewPtr, MPI, VecEltVT, Alignment,
19719 OriginalLoad->getMemOperand()->getFlags(),
19720 OriginalLoad->getAAInfo());
19721 DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
19723 // The result type is narrower or the same width as the vector element
19724 Load = DAG.getLoad(VecEltVT, DL, OriginalLoad->getChain(), NewPtr, MPI,
19725 Alignment, OriginalLoad->getMemOperand()->getFlags(),
19726 OriginalLoad->getAAInfo());
19727 DAG.makeEquivalentMemoryOrdering(OriginalLoad, Load);
19728 if (ResultVT.bitsLT(VecEltVT))
19729 Load = DAG.getNode(ISD::TRUNCATE, DL, ResultVT, Load);
19731 Load = DAG.getBitcast(ResultVT, Load);
19737 /// Transform a vector binary operation into a scalar binary operation by moving
19738 /// the math/logic after an extract element of a vector.
// ExtElt is an EXTRACT_VECTOR_ELT node with a constant index; returns the
// scalarized binop, or a null SDValue if the pattern/target checks fail.
19739 static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
19740 bool LegalOperations) {
19741 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19742 SDValue Vec = ExtElt->getOperand(0);
19743 SDValue Index = ExtElt->getOperand(1);
19744 auto *IndexC = dyn_cast<ConstantSDNode>(Index);
// Require: constant index, a single-use single-result binop source.
19745 if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
19746 Vec->getNumValues() != 1)
19749 // Targets may want to avoid this to prevent an expensive register transfer.
19750 if (!TLI.shouldScalarizeBinop(Vec))
19753 // Extracting an element of a vector constant is constant-folded, so this
19754 // transform is just replacing a vector op with a scalar op while moving the
19756 SDValue Op0 = Vec.getOperand(0);
19757 SDValue Op1 = Vec.getOperand(1);
// Only profitable when one operand is a (splat) constant: the constant
// side folds away, leaving a single scalar extract plus a scalar binop.
19759 if (isAnyConstantBuildVector(Op0, true) ||
19760 ISD::isConstantSplatVector(Op0.getNode(), SplatVal) ||
19761 isAnyConstantBuildVector(Op1, true) ||
19762 ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
19763 // extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
19764 // extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
19766 EVT VT = ExtElt->getValueType(0);
19767 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
19768 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
19769 return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
19775 SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
19776 SDValue VecOp = N->getOperand(0);
19777 SDValue Index = N->getOperand(1);
19778 EVT ScalarVT = N->getValueType(0);
19779 EVT VecVT = VecOp.getValueType();
19780 if (VecOp.isUndef())
19781 return DAG.getUNDEF(ScalarVT);
19783 // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
19785 // This only really matters if the index is non-constant since other combines
19786 // on the constant elements already work.
19788 if (VecOp.getOpcode() == ISD::INSERT_VECTOR_ELT &&
19789 Index == VecOp.getOperand(2)) {
19790 SDValue Elt = VecOp.getOperand(1);
19791 return VecVT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, DL, ScalarVT) : Elt;
19794 // (vextract (scalar_to_vector val, 0) -> val
19795 if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
19796 // Only 0'th element of SCALAR_TO_VECTOR is defined.
19797 if (DAG.isKnownNeverZero(Index))
19798 return DAG.getUNDEF(ScalarVT);
19800 // Check if the result type doesn't match the inserted element type. A
19801 // SCALAR_TO_VECTOR may truncate the inserted element and the
19802 // EXTRACT_VECTOR_ELT may widen the extracted vector.
19803 SDValue InOp = VecOp.getOperand(0);
19804 if (InOp.getValueType() != ScalarVT) {
19805 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger() &&
19806 InOp.getValueType().bitsGT(ScalarVT));
19807 return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, InOp);
19812 // extract_vector_elt of out-of-bounds element -> UNDEF
19813 auto *IndexC = dyn_cast<ConstantSDNode>(Index);
19814 if (IndexC && VecVT.isFixedLengthVector() &&
19815 IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
19816 return DAG.getUNDEF(ScalarVT);
19818 // extract_vector_elt (build_vector x, y), 1 -> y
19819 if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
19820 VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
19821 TLI.isTypeLegal(VecVT) &&
19822 (VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
19823 assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
19824 VecVT.isFixedLengthVector()) &&
19825 "BUILD_VECTOR used for scalable vectors");
19826 unsigned IndexVal =
19827 VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
19828 SDValue Elt = VecOp.getOperand(IndexVal);
19829 EVT InEltVT = Elt.getValueType();
19831 // Sometimes build_vector's scalar input types do not match result type.
19832 if (ScalarVT == InEltVT)
19835 // TODO: It may be useful to truncate if free if the build_vector implicitly
19839 if (SDValue BO = scalarizeExtractedBinop(N, DAG, LegalOperations))
19842 if (VecVT.isScalableVector())
19845 // All the code from this point onwards assumes fixed width vectors, but it's
19846 // possible that some of the combinations could be made to work for scalable
19848 unsigned NumElts = VecVT.getVectorNumElements();
19849 unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
19851 // TODO: These transforms should not require the 'hasOneUse' restriction, but
19852 // there are regressions on multiple targets without it. We can end up with a
19853 // mess of scalar and vector code if we reduce only part of the DAG to scalar.
19854 if (IndexC && VecOp.getOpcode() == ISD::BITCAST && VecVT.isInteger() &&
19855 VecOp.hasOneUse()) {
19856 // The vector index of the LSBs of the source depend on the endian-ness.
19857 bool IsLE = DAG.getDataLayout().isLittleEndian();
19858 unsigned ExtractIndex = IndexC->getZExtValue();
19859 // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
19860 unsigned BCTruncElt = IsLE ? 0 : NumElts - 1;
19861 SDValue BCSrc = VecOp.getOperand(0);
19862 if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
19863 return DAG.getNode(ISD::TRUNCATE, DL, ScalarVT, BCSrc);
19865 if (LegalTypes && BCSrc.getValueType().isInteger() &&
19866 BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
19867 // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
19868 // trunc i64 X to i32
19869 SDValue X = BCSrc.getOperand(0);
19870 assert(X.getValueType().isScalarInteger() && ScalarVT.isScalarInteger() &&
19871 "Extract element and scalar to vector can't change element type "
19872 "from FP to integer.");
19873 unsigned XBitWidth = X.getValueSizeInBits();
19874 BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
19876 // An extract element return value type can be wider than its vector
19877 // operand element type. In that case, the high bits are undefined, so
19878 // it's possible that we may need to extend rather than truncate.
19879 if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth) {
19880 assert(XBitWidth % VecEltBitWidth == 0 &&
19881 "Scalar bitwidth must be a multiple of vector element bitwidth");
19882 return DAG.getAnyExtOrTrunc(X, DL, ScalarVT);
19887 // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
19888 // We only perform this optimization before the op legalization phase because
19889 // we may introduce new vector instructions which are not backed by TD
19890 // patterns. For example on AVX, extracting elements from a wide vector
19891 // without using extract_subvector. However, if we can find an underlying
19892 // scalar value, then we can always use that.
19893 if (IndexC && VecOp.getOpcode() == ISD::VECTOR_SHUFFLE) {
19894 auto *Shuf = cast<ShuffleVectorSDNode>(VecOp);
19895 // Find the new index to extract from.
19896 int OrigElt = Shuf->getMaskElt(IndexC->getZExtValue());
19898 // Extracting an undef index is undef.
19900 return DAG.getUNDEF(ScalarVT);
19902 // Select the right vector half to extract from.
19904 if (OrigElt < (int)NumElts) {
19905 SVInVec = VecOp.getOperand(0);
19907 SVInVec = VecOp.getOperand(1);
19908 OrigElt -= NumElts;
19911 if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
19912 SDValue InOp = SVInVec.getOperand(OrigElt);
19913 if (InOp.getValueType() != ScalarVT) {
19914 assert(InOp.getValueType().isInteger() && ScalarVT.isInteger());
19915 InOp = DAG.getSExtOrTrunc(InOp, DL, ScalarVT);
19921 // FIXME: We should handle recursing on other vector shuffles and
19922 // scalar_to_vector here as well.
19924 if (!LegalOperations ||
19925 // FIXME: Should really be just isOperationLegalOrCustom.
19926 TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
19927 TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
19928 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
19929 DAG.getVectorIdxConstant(OrigElt, DL));
19933 // If only EXTRACT_VECTOR_ELT nodes use the source vector we can
19934 // simplify it based on the (valid) extraction indices.
19935 if (llvm::all_of(VecOp->uses(), [&](SDNode *Use) {
19936 return Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
19937 Use->getOperand(0) == VecOp &&
19938 isa<ConstantSDNode>(Use->getOperand(1));
19940 APInt DemandedElts = APInt::getZero(NumElts);
19941 for (SDNode *Use : VecOp->uses()) {
19942 auto *CstElt = cast<ConstantSDNode>(Use->getOperand(1));
19943 if (CstElt->getAPIntValue().ult(NumElts))
19944 DemandedElts.setBit(CstElt->getZExtValue());
19946 if (SimplifyDemandedVectorElts(VecOp, DemandedElts, true)) {
19947 // We simplified the vector operand of this extract element. If this
19948 // extract is not dead, visit it again so it is folded properly.
19949 if (N->getOpcode() != ISD::DELETED_NODE)
19951 return SDValue(N, 0);
19953 APInt DemandedBits = APInt::getAllOnes(VecEltBitWidth);
19954 if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
19955 // We simplified the vector operand of this extract element. If this
19956 // extract is not dead, visit it again so it is folded properly.
19957 if (N->getOpcode() != ISD::DELETED_NODE)
19959 return SDValue(N, 0);
19963 // Everything under here is trying to match an extract of a loaded value.
19964 // If the result of load has to be truncated, then it's not necessarily
19966 bool BCNumEltsChanged = false;
19967 EVT ExtVT = VecVT.getVectorElementType();
19969 if (ScalarVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, ScalarVT))
19972 if (VecOp.getOpcode() == ISD::BITCAST) {
19973 // Don't duplicate a load with other uses.
19974 if (!VecOp.hasOneUse())
19977 EVT BCVT = VecOp.getOperand(0).getValueType();
19978 if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
19980 if (NumElts != BCVT.getVectorNumElements())
19981 BCNumEltsChanged = true;
19982 VecOp = VecOp.getOperand(0);
19983 ExtVT = BCVT.getVectorElementType();
19986 // extract (vector load $addr), i --> load $addr + i * size
19987 if (!LegalOperations && !IndexC && VecOp.hasOneUse() &&
19988 ISD::isNormalLoad(VecOp.getNode()) &&
19989 !Index->hasPredecessor(VecOp.getNode())) {
19990 auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
19991 if (VecLoad && VecLoad->isSimple())
19992 return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
19995 // Perform only after legalization to ensure build_vector / vector_shuffle
19996 // optimizations have already been done.
19997 if (!LegalOperations || !IndexC)
20000 // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
20001 // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
20002 // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
20003 int Elt = IndexC->getZExtValue();
20004 LoadSDNode *LN0 = nullptr;
20005 if (ISD::isNormalLoad(VecOp.getNode())) {
20006 LN0 = cast<LoadSDNode>(VecOp);
20007 } else if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
20008 VecOp.getOperand(0).getValueType() == ExtVT &&
20009 ISD::isNormalLoad(VecOp.getOperand(0).getNode())) {
20010 // Don't duplicate a load with other uses.
20011 if (!VecOp.hasOneUse())
20014 LN0 = cast<LoadSDNode>(VecOp.getOperand(0));
20016 if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(VecOp)) {
20017 // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
20019 // (load $addr+1*size)
20021 // Don't duplicate a load with other uses.
20022 if (!VecOp.hasOneUse())
20025 // If the bit convert changed the number of elements, it is unsafe
20026 // to examine the mask.
20027 if (BCNumEltsChanged)
20030 // Select the input vector, guarding against out of range extract vector.
20031 int Idx = (Elt > (int)NumElts) ? -1 : Shuf->getMaskElt(Elt);
20032 VecOp = (Idx < (int)NumElts) ? VecOp.getOperand(0) : VecOp.getOperand(1);
20034 if (VecOp.getOpcode() == ISD::BITCAST) {
20035 // Don't duplicate a load with other uses.
20036 if (!VecOp.hasOneUse())
20039 VecOp = VecOp.getOperand(0);
20041 if (ISD::isNormalLoad(VecOp.getNode())) {
20042 LN0 = cast<LoadSDNode>(VecOp);
20043 Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
20044 Index = DAG.getConstant(Elt, DL, Index.getValueType());
20046 } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
20047 VecVT.getVectorElementType() == ScalarVT &&
20050 VecOp.getOperand(0).getValueType().getVectorElementType()))) {
20051 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
20052 // -> extract_vector_elt a, 0
20053 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
20054 // -> extract_vector_elt a, 1
20055 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
20056 // -> extract_vector_elt b, 0
20057 // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
20058 // -> extract_vector_elt b, 1
20060 EVT ConcatVT = VecOp.getOperand(0).getValueType();
20061 unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
20062 SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
20063 Index.getValueType());
20065 SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
20066 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
20067 ConcatVT.getVectorElementType(),
20069 return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
20072 // Make sure we found a non-volatile load and the extractelement is
20074 if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
20077 // If Idx was -1 above, Elt is going to be -1, so just return undef.
20079 return DAG.getUNDEF(LVT);
20081 return scalarizeExtractedVectorLoad(N, VecVT, Index, LN0);
// Simplify (build_vec (ext )) to (bitcast (build_vec ))
//
// If every defined operand of this BUILD_VECTOR is an ISD::ANY_EXTEND or
// ISD::ZERO_EXTEND from one common narrower scalar type, rebuild the vector
// out of the narrow source scalars (filling the extra narrow lanes of each
// group with undef, or with zero when any zero_extend is present) and
// bitcast the narrow build_vector back to the original type.
SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
  // We perform this optimization post type-legalization because
  // the type-legalizer often scalarizes integer-promoted vectors.
  // Performing this optimization before may create bit-casts which
  // will be type-legalized to complex code sequences.
  // We perform this optimization only before the operation legalizer because we
  // may introduce illegal operations.
  if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
  unsigned NumInScalars = N->getNumOperands();
  EVT VT = N->getValueType(0);
  // Check to see if this is a BUILD_VECTOR of a bunch of values
  // which come from any_extend or zero_extend nodes. If so, we can create
  // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
  // optimizations. We do not handle sign-extend because we can't fill the sign
  // bits of the narrow lanes this way.
  EVT SourceType = MVT::Other;
  bool AllAnyExt = true;
  for (unsigned i = 0; i != NumInScalars; ++i) {
    SDValue In = N->getOperand(i);
    // Ignore undef inputs.
    if (In.isUndef()) continue;
    bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
    bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
    // Abort if the element is not an extension.
    if (!ZeroExt && !AnyExt) {
      SourceType = MVT::Other;
    // The input is a ZeroExt or AnyExt. Check the original type.
    EVT InTy = In.getOperand(0).getValueType();
    // Check that all of the widened source types are the same.
    if (SourceType == MVT::Other)
    else if (InTy != SourceType) {
      // Multiple income types. Abort.
      SourceType = MVT::Other;
    // Check if all of the extends are ANY_EXTENDs.
    AllAnyExt &= AnyExt;
  // In order to have valid types, all of the inputs must be extended from the
  // same source type and all of the inputs must be any or zero extend.
  // Scalar sizes must be a power of two.
  EVT OutScalarTy = VT.getScalarType();
  bool ValidTypes = SourceType != MVT::Other &&
                    isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
                    isPowerOf2_32(SourceType.getSizeInBits());
  // Create a new simpler BUILD_VECTOR sequence which other optimizations can
  // turn into a single shuffle instruction.
  // If we already have a splat buildvector, then don't fold it if it means
  // introducing zeros.
  if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
  bool isLE = DAG.getDataLayout().isLittleEndian();
  unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
  assert(ElemRatio > 1 && "Invalid element size ratio");
  // When only any_extends feed the vector, the unused narrow lanes may be
  // undef; otherwise they must be zero to preserve zero_extend semantics.
  SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
                               DAG.getConstant(0, DL, SourceType);
  unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
  SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
  // Populate the new build_vector
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
    SDValue Cast = N->getOperand(i);
    assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
            Cast.getOpcode() == ISD::ZERO_EXTEND ||
            Cast.isUndef()) && "Invalid cast opcode");
    if (Cast.isUndef())
      In = DAG.getUNDEF(SourceType);
      In = Cast->getOperand(0);
    // Place the narrow scalar in the low-order lane of its group on
    // little-endian targets, or the high-order lane on big-endian targets.
    unsigned Index = isLE ? (i * ElemRatio) :
                            (i * ElemRatio + (ElemRatio - 1));
    assert(Index < Ops.size() && "Invalid index");
  // The type of the new BUILD_VECTOR node.
  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
  assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
         "Invalid vector size");
  // Check if the new vector type is legal.
  if (!isTypeLegal(VecVT) ||
      (!TLI.isOperationLegal(ISD::BUILD_VECTOR, VecVT) &&
       TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)))
  // Make the new BUILD_VECTOR.
  SDValue BV = DAG.getBuildVector(VecVT, DL, Ops);
  // The new BUILD_VECTOR node has the potential to be further optimized.
  AddToWorklist(BV.getNode());
  // Bitcast to the desired type.
  return DAG.getBitcast(VT, BV);
// Simplify (build_vec (trunc $1)
//          (trunc (srl $1 half-width))
//          (trunc (srl $1 (2 * half-width))) …)
// to a single bitcast of the wide source value, when every element is a
// truncate of the same source shifted right by i * element-width bits.
SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
  assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
  // Only for little endian
  if (!DAG.getDataLayout().isLittleEndian())
  EVT VT = N->getValueType(0);
  EVT OutScalarTy = VT.getScalarType();
  uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
  // Only for power of two types to be sure that bitcast works well
  if (!isPowerOf2_64(ScalarTypeBitsize))
  unsigned NumInScalars = N->getNumOperands();
  // Look through bitcasts
  auto PeekThroughBitcast = [](SDValue Op) {
    if (Op.getOpcode() == ISD::BITCAST)
      return Op.getOperand(0);
  // The source value where all the parts are extracted.
  for (unsigned i = 0; i != NumInScalars; ++i) {
    SDValue In = PeekThroughBitcast(N->getOperand(i));
    // Ignore undef inputs.
    if (In.isUndef()) continue;
    // Every defined element must be a truncate ...
    if (In.getOpcode() != ISD::TRUNCATE)
    In = PeekThroughBitcast(In.getOperand(0));
    if (In.getOpcode() != ISD::SRL) {
      // For now only build_vec without shuffling, handle shifts here in the
      // ... of an SRL of the one common source value.
      SDValue part = PeekThroughBitcast(In.getOperand(0));
    } else if (Src != part) {
      // Vector parts do not stem from the same variable
    SDValue ShiftAmtVal = In.getOperand(1);
    if (!isa<ConstantSDNode>(ShiftAmtVal))
    uint64_t ShiftAmt = In.getConstantOperandVal(1);
    // The extracted value is not extracted at the right position
    if (ShiftAmt != i * ScalarTypeBitsize)
  // Only cast if the size is the same
  if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
  return DAG.getBitcast(VT, Src);
// Build one shuffle for a "pair slot" of a BUILD_VECTOR-of-extracts.
// VectorMask maps each output element to the number of the input vector it
// is extracted from; elements numbered LeftIdx come from VecIn1, and those
// numbered LeftIdx + 1 come from VecIn2. DidSplitVec indicates VecIn1/VecIn2
// were produced by splitting one wider source vector, in which case extract
// indices are already relative to VecIn1. Returns the shuffle (possibly
// wrapped in an extract_subvector), or SDValue() when the input and output
// vector types cannot be reconciled.
SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
                                           ArrayRef<int> VectorMask,
                                           SDValue VecIn1, SDValue VecIn2,
                                           unsigned LeftIdx, bool DidSplitVec) {
  SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
  EVT VT = N->getValueType(0);
  EVT InVT1 = VecIn1.getValueType();
  EVT InVT2 = VecIn2.getNode() ? VecIn2.getValueType() : InVT1;
  unsigned NumElems = VT.getVectorNumElements();
  unsigned ShuffleNumElems = NumElems;
  // If we artificially split a vector in two already, then the offsets in the
  // operands will all be based off of VecIn1, even those in VecIn2.
  unsigned Vec2Offset = DidSplitVec ? 0 : InVT1.getVectorNumElements();
  uint64_t VTSize = VT.getFixedSizeInBits();
  uint64_t InVT1Size = InVT1.getFixedSizeInBits();
  uint64_t InVT2Size = InVT2.getFixedSizeInBits();
  assert(InVT2Size <= InVT1Size &&
         "Inputs must be sorted to be in non-increasing vector size order.");
  // We can't generate a shuffle node with mismatched input and output types.
  // Try to make the types match the type of the output.
  if (InVT1 != VT || InVT2 != VT) {
    if ((VTSize % InVT1Size == 0) && InVT1 == InVT2) {
      // If the output vector length is a multiple of both input lengths,
      // we can concatenate them and pad the rest with undefs.
      unsigned NumConcats = VTSize / InVT1Size;
      assert(NumConcats >= 2 && "Concat needs at least two inputs!");
      SmallVector<SDValue, 2> ConcatOps(NumConcats, DAG.getUNDEF(InVT1));
      ConcatOps[0] = VecIn1;
      ConcatOps[1] = VecIn2 ? VecIn2 : DAG.getUNDEF(InVT1);
      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
      VecIn2 = SDValue();
    } else if (InVT1Size == VTSize * 2) {
      if (!TLI.isExtractSubvectorCheap(VT, InVT1, NumElems))
      if (!VecIn2.getNode()) {
        // If we only have one input vector, and it's twice the size of the
        // output, split it in two.
        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
                             DAG.getVectorIdxConstant(NumElems, DL));
        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
        // Since we now have shorter input vectors, adjust the offset of the
        // second vector's start.
        Vec2Offset = NumElems;
        assert(InVT2Size <= InVT1Size &&
               "Second input is not going to be larger than the first one.");
        // VecIn1 is wider than the output, and we have another, possibly
        // smaller input. Pad the smaller input with undefs, shuffle at the
        // input vector width, and extract the output.
        // The shuffle type is different than VT, so check legality again.
        if (LegalOperations &&
            !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
        // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
        // lower it back into a BUILD_VECTOR. So if the inserted type is
        // illegal, don't even try.
        if (InVT1 != InVT2) {
          if (!TLI.isTypeLegal(InVT2))
          VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
                               DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
        // Shuffle at the wider width; the result is extracted back to VT
        // below once the shuffle has been built.
        ShuffleNumElems = NumElems * 2;
    } else if (InVT2Size * 2 == VTSize && InVT1Size == VTSize) {
      SmallVector<SDValue, 2> ConcatOps(2, DAG.getUNDEF(InVT2));
      ConcatOps[0] = VecIn2;
      VecIn2 = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
      // TODO: Support cases where the length mismatch isn't exactly by a
      // TODO: Move this check upwards, so that if we have bad type
      // mismatches, we don't create any DAG nodes.
  // Initialize mask to undef.
  SmallVector<int, 8> Mask(ShuffleNumElems, -1);
  // Only need to run up to the number of elements actually used, not the
  // total number of elements in the shuffle - if we are shuffling a wider
  // vector, the high lanes should be set to undef.
  for (unsigned i = 0; i != NumElems; ++i) {
    if (VectorMask[i] <= 0)
    unsigned ExtIndex = N->getOperand(i).getConstantOperandVal(1);
    if (VectorMask[i] == (int)LeftIdx) {
      Mask[i] = ExtIndex;
    } else if (VectorMask[i] == (int)LeftIdx + 1) {
      Mask[i] = Vec2Offset + ExtIndex;
  // The type the input vectors may have changed above.
  InVT1 = VecIn1.getValueType();
  // If we already have a VecIn2, it should have the same type as VecIn1.
  // If we don't, get an undef/zero vector of the appropriate type.
  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(InVT1);
  assert(InVT1 == VecIn2.getValueType() && "Unexpected second input type.");
  SDValue Shuffle = DAG.getVectorShuffle(InVT1, DL, VecIn1, VecIn2, Mask);
  if (ShuffleNumElems > NumElems)
    Shuffle = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuffle, ZeroIdx);
// If a build vector is all-undef except for a single element that is a
// zero-extended extract_vector_elt at a constant index, turn the whole node
// into a shuffle of the extract's source vector against a zero vector,
// bitcast to the build vector's type. Returns SDValue() when the pattern or
// the sizes do not match.
static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
  assert(BV->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
  // First, determine where the build vector is not undef.
  // TODO: We could extend this to handle zero elements as well as undefs.
  int NumBVOps = BV->getNumOperands();
  for (int i = 0; i != NumBVOps; ++i) {
    SDValue Op = BV->getOperand(i);
  // Bail out if there's no non-undef element.
  // The build vector contains some number of undef elements and exactly
  // one other element. That other element must be a zero-extended scalar
  // extracted from a vector at a constant index to turn this into a shuffle.
  // Also, require that the build vector does not implicitly truncate/extend
  // its elements.
  // TODO: This could be enhanced to allow ANY_EXTEND as well as ZERO_EXTEND.
  EVT VT = BV->getValueType(0);
  SDValue Zext = BV->getOperand(ZextElt);
  if (Zext.getOpcode() != ISD::ZERO_EXTEND || !Zext.hasOneUse() ||
      Zext.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Zext.getOperand(0).getOperand(1)) ||
      Zext.getValueSizeInBits() != VT.getScalarSizeInBits())
  // The zero-extend must be a multiple of the source size, and we must be
  // building a vector of the same size as the source of the extract element.
  SDValue Extract = Zext.getOperand(0);
  unsigned DestSize = Zext.getValueSizeInBits();
  unsigned SrcSize = Extract.getValueSizeInBits();
  if (DestSize % SrcSize != 0 ||
      Extract.getOperand(0).getValueSizeInBits() != VT.getSizeInBits())
  // Create a shuffle mask that will combine the extracted element with zeros
  // in the remaining narrow lanes of its group.
  int ZextRatio = DestSize / SrcSize;
  int NumMaskElts = NumBVOps * ZextRatio;
  SmallVector<int, 32> ShufMask(NumMaskElts, -1);
  for (int i = 0; i != NumMaskElts; ++i) {
    if (i / ZextRatio == ZextElt) {
      // The low bits of the (potentially translated) extracted element map to
      // the source vector. The high bits map to zero. We will use a zero vector
      // as the 2nd source operand of the shuffle, so use the 1st element of
      // that vector (mask value is number-of-elements) for the high bits.
      if (i % ZextRatio == 0)
        ShufMask[i] = Extract.getConstantOperandVal(1);
        ShufMask[i] = NumMaskElts;
  // Undef elements of the build vector remain undef because we initialize
  // the shuffle mask with -1.
  // buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
  // bitcast (shuffle V, ZeroVec, VectorMask)
  EVT VecVT = Extract.getOperand(0).getValueType();
  SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
                                             ZeroVec, ShufMask, DAG);
  return DAG.getBitcast(VT, Shuf);
// FIXME: promote to STLExtras.
// Return the index of the first occurrence of \p Val in \p Range, or -1 if
// \p Val is not present. The result uses the range's difference type.
template <typename R, typename T>
static auto getFirstIndexOf(R &&Range, const T &Val) {
  auto It = std::find(std::begin(Range), std::end(Range), Val);
  using DiffT = decltype(std::distance(std::begin(Range), It));
  if (It == std::end(Range))
    return static_cast<DiffT>(-1);
  return std::distance(std::begin(Range), It);
}
// Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
// operations. If the types of the vectors we're extracting from allow it,
// turn this into a vector_shuffle node.
//
// The result is built as a binary tree of shuffles: source vectors are
// paired and shuffled via createBuildVecShuffle, then the partial shuffles
// are blended pairwise until one root value remains. Returns that root, or
// SDValue() when the combine does not apply.
SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
  EVT VT = N->getValueType(0);
  // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
  if (!isTypeLegal(VT))
  if (SDValue V = reduceBuildVecToShuffleWithZero(N, DAG))
  // May only combine to shuffle after legalize if shuffle is legal.
  if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
  bool UsesZeroVector = false;
  unsigned NumElems = N->getNumOperands();
  // Record, for each element of the newly built vector, which input vector
  // that element comes from. -1 stands for undef, 0 for the zero vector,
  // and positive values for the input vectors.
  // VectorMask maps each element to its vector number, and VecIn maps vector
  // numbers to their initial SDValues.
  SmallVector<int, 8> VectorMask(NumElems, -1);
  SmallVector<SDValue, 8> VecIn;
  VecIn.push_back(SDValue());
  for (unsigned i = 0; i != NumElems; ++i) {
    SDValue Op = N->getOperand(i);
    // See if we can use a blend with a zero vector.
    // TODO: Should we generalize this to a blend with an arbitrary constant
    if (isNullConstant(Op) || isNullFPConstant(Op)) {
      UsesZeroVector = true;
    // Not an undef or zero. If the input is something other than an
    // EXTRACT_VECTOR_ELT with an in-range constant index, bail out.
    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op.getOperand(1)))
    SDValue ExtractedFromVec = Op.getOperand(0);
    // Scalable source vectors have no fixed element count to index into.
    if (ExtractedFromVec.getValueType().isScalableVector())
    const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
    if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
    // All inputs must have the same element type as the output.
    if (VT.getVectorElementType() !=
        ExtractedFromVec.getValueType().getVectorElementType())
    // Have we seen this input vector before?
    // The vectors are expected to be tiny (usually 1 or 2 elements), so using
    // a map back from SDValues to numbers isn't worth it.
    int Idx = getFirstIndexOf(VecIn, ExtractedFromVec);
    if (Idx == -1) { // A new source vector?
      Idx = VecIn.size();
      VecIn.push_back(ExtractedFromVec);
    VectorMask[i] = Idx;
  // If we didn't find at least one input vector, bail out.
  if (VecIn.size() < 2)
  // If all the Operands of BUILD_VECTOR extract from same
  // vector, then split the vector efficiently based on the maximum
  // vector access index and adjust the VectorMask and
  // VecIn accordingly.
  bool DidSplitVec = false;
  if (VecIn.size() == 2) {
    unsigned MaxIndex = 0;
    unsigned NearestPow2 = 0;
    SDValue Vec = VecIn.back();
    EVT InVT = Vec.getValueType();
    SmallVector<unsigned, 8> IndexVec(NumElems, 0);
    for (unsigned i = 0; i < NumElems; i++) {
      if (VectorMask[i] <= 0)
      unsigned Index = N->getOperand(i).getConstantOperandVal(1);
      IndexVec[i] = Index;
      MaxIndex = std::max(MaxIndex, Index);
    NearestPow2 = PowerOf2Ceil(MaxIndex);
    if (InVT.isSimple() && NearestPow2 > 2 && MaxIndex < NearestPow2 &&
        NumElems * 2 < NearestPow2) {
      unsigned SplitSize = NearestPow2 / 2;
      EVT SplitVT = EVT::getVectorVT(*DAG.getContext(),
                                     InVT.getVectorElementType(), SplitSize);
      if (TLI.isTypeLegal(SplitVT) &&
          SplitSize + SplitVT.getVectorNumElements() <=
              InVT.getVectorNumElements()) {
        SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                     DAG.getVectorIdxConstant(SplitSize, DL))
                             ;
        SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
                                     DAG.getVectorIdxConstant(0, DL));
        VecIn.push_back(VecIn1);
        VecIn.push_back(VecIn2);
        DidSplitVec = true;
        // Re-point each used element at the half of the split it reads from.
        for (unsigned i = 0; i < NumElems; i++) {
          if (VectorMask[i] <= 0)
          VectorMask[i] = (IndexVec[i] < SplitSize) ? 1 : 2;
  // Sort input vectors by decreasing vector element count,
  // while preserving the relative order of equally-sized vectors.
  // Note that we keep the first "implicit" zero vector as-is.
  SmallVector<SDValue, 8> SortedVecIn(VecIn);
  llvm::stable_sort(MutableArrayRef<SDValue>(SortedVecIn).drop_front(),
                    [](const SDValue &a, const SDValue &b) {
                      return a.getValueType().getVectorNumElements() >
                             b.getValueType().getVectorNumElements();
  // We now also need to rebuild the VectorMask, because it referenced element
  // order in VecIn, and we just sorted them.
  for (int &SourceVectorIndex : VectorMask) {
    if (SourceVectorIndex <= 0)
    unsigned Idx = getFirstIndexOf(SortedVecIn, VecIn[SourceVectorIndex]);
    assert(Idx > 0 && Idx < SortedVecIn.size() &&
           VecIn[SourceVectorIndex] == SortedVecIn[Idx] && "Remapping failure");
    SourceVectorIndex = Idx;
  VecIn = std::move(SortedVecIn);
  // TODO: Should this fire if some of the input vectors has illegal type (like
  // it does now), or should we let legalization run its course first?
  // Take pairs of vectors, and shuffle them so that the result has elements
  // from these vectors in the correct places.
  // For example, given:
  // t10: i32 = extract_vector_elt t1, Constant:i64<0>
  // t11: i32 = extract_vector_elt t2, Constant:i64<0>
  // t12: i32 = extract_vector_elt t3, Constant:i64<0>
  // t13: i32 = extract_vector_elt t1, Constant:i64<1>
  // t14: v4i32 = BUILD_VECTOR t10, t11, t12, t13
  // We will generate:
  // t20: v4i32 = vector_shuffle<0,4,u,1> t1, t2
  // t21: v4i32 = vector_shuffle<u,u,0,u> t3, undef
  SmallVector<SDValue, 4> Shuffles;
  for (unsigned In = 0, Len = (VecIn.size() / 2); In < Len; ++In) {
    unsigned LeftIdx = 2 * In + 1;
    SDValue VecLeft = VecIn[LeftIdx];
        (LeftIdx + 1) < VecIn.size() ? VecIn[LeftIdx + 1] : SDValue();
    if (SDValue Shuffle = createBuildVecShuffle(DL, N, VectorMask, VecLeft,
                                                VecRight, LeftIdx, DidSplitVec))
      Shuffles.push_back(Shuffle);
  // If we need the zero vector as an "ingredient" in the blend tree, add it
  // to the list of shuffles.
  if (UsesZeroVector)
    Shuffles.push_back(VT.isInteger() ? DAG.getConstant(0, DL, VT)
                                      : DAG.getConstantFP(0.0, DL, VT));
  // If we only have one shuffle, we're done.
  if (Shuffles.size() == 1)
    return Shuffles[0];
  // Update the vector mask to point to the post-shuffle vectors.
  for (int &Vec : VectorMask)
      Vec = Shuffles.size() - 1;
      Vec = (Vec - 1) / 2;
  // More than one shuffle. Generate a binary tree of blends, e.g. if from
  // the previous step we got the set of shuffles t10, t11, t12, t13, we will
  // generate:
  // t10: v8i32 = vector_shuffle<0,8,u,u,u,u,u,u> t1, t2
  // t11: v8i32 = vector_shuffle<u,u,0,8,u,u,u,u> t3, t4
  // t12: v8i32 = vector_shuffle<u,u,u,u,0,8,u,u> t5, t6
  // t13: v8i32 = vector_shuffle<u,u,u,u,u,u,0,8> t7, t8
  // t20: v8i32 = vector_shuffle<0,1,10,11,u,u,u,u> t10, t11
  // t21: v8i32 = vector_shuffle<u,u,u,u,4,5,14,15> t12, t13
  // t30: v8i32 = vector_shuffle<0,1,2,3,12,13,14,15> t20, t21
  // Make sure the initial size of the shuffle list is even.
  if (Shuffles.size() % 2)
    Shuffles.push_back(DAG.getUNDEF(VT));
  for (unsigned CurSize = Shuffles.size(); CurSize > 1; CurSize /= 2) {
      Shuffles[CurSize] = DAG.getUNDEF(VT);
    for (unsigned In = 0, Len = CurSize / 2; In < Len; ++In) {
      int Right = 2 * In + 1;
      SmallVector<int, 8> Mask(NumElems, -1);
      SDValue L = Shuffles[Left];
      ArrayRef<int> LMask;
      // Look through a single-input shuffle so its mask can be composed
      // directly into the blend instead of chaining two shuffles.
      bool IsLeftShuffle = L.getOpcode() == ISD::VECTOR_SHUFFLE &&
                           L.use_empty() && L.getOperand(1).isUndef() &&
                           L.getOperand(0).getValueType() == L.getValueType();
      if (IsLeftShuffle) {
        LMask = cast<ShuffleVectorSDNode>(L.getNode())->getMask();
        L = L.getOperand(0);
      SDValue R = Shuffles[Right];
      ArrayRef<int> RMask;
      bool IsRightShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE &&
                            R.use_empty() && R.getOperand(1).isUndef() &&
                            R.getOperand(0).getValueType() == R.getValueType();
      if (IsRightShuffle) {
        RMask = cast<ShuffleVectorSDNode>(R.getNode())->getMask();
        R = R.getOperand(0);
      for (unsigned I = 0; I != NumElems; ++I) {
        if (VectorMask[I] == Left) {
            Mask[I] = LMask[I];
          VectorMask[I] = In;
        } else if (VectorMask[I] == Right) {
          Mask[I] = I + NumElems;
          if (IsRightShuffle)
            Mask[I] = RMask[I] + NumElems;
          VectorMask[I] = In;
      Shuffles[In] = DAG.getVectorShuffle(VT, DL, L, R, Mask);
  return Shuffles[0];
// Try to turn a build vector of zero extends of extract vector elts into a
// vector zero extend and possibly an extract subvector.
// TODO: Support sign extend?
// TODO: Allow undef elements?
SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
  if (LegalOperations)
  EVT VT = N->getValueType(0);
  bool FoundZeroExtend = false;
  SDValue Op0 = N->getOperand(0);
  // checkElem returns the constant extract index when Op is a
  // (zext|aext (extract_vector_elt X, C)) whose source vector X matches the
  // first operand's source. It also records (via FoundZeroExtend) whether any
  // zero_extend was seen, so the final node can use ZERO_EXTEND when needed.
  auto checkElem = [&](SDValue Op) -> int64_t {
    unsigned Opc = Op.getOpcode();
    FoundZeroExtend |= (Opc == ISD::ZERO_EXTEND);
    if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND) &&
        Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
        Op0.getOperand(0).getOperand(0) == Op.getOperand(0).getOperand(0))
      if (auto *C = dyn_cast<ConstantSDNode>(Op.getOperand(0).getOperand(1)))
        return C->getZExtValue();
  // Make sure the first element matches
  // (zext (extract_vector_elt X, C))
  // Offset must be a constant multiple of the
  // known-minimum vector length of the result type.
  int64_t Offset = checkElem(Op0);
  if (Offset < 0 || (Offset % VT.getVectorNumElements()) != 0)
  unsigned NumElems = N->getNumOperands();
  SDValue In = Op0.getOperand(0).getOperand(0);
  EVT InSVT = In.getValueType().getScalarType();
  EVT InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumElems);
  // Don't create an illegal input type after type legalization.
  if (LegalTypes && !TLI.isTypeLegal(InVT))
  // Ensure all the elements come from the same vector and are adjacent.
  for (unsigned i = 1; i != NumElems; ++i) {
    if ((Offset + i) != checkElem(N->getOperand(i)))
  // Grab the run of source elements starting at the first extract index,
  // then extend the narrow subvector to the result type in one node.
  In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InVT, In,
                   Op0.getOperand(0).getOperand(1));
  return DAG.getNode(FoundZeroExtend ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, DL,
// Main BUILD_VECTOR combine driver: tries a sequence of folds (undef-fold,
// splat-of-bitcast -> concat, identity/subvector extract, then the
// reduceBuildVec*/convertBuildVec* helpers, finally SPLAT_VECTOR).
// NOTE(review): this extract of the file is missing several short lines
// (early returns, `return V;` after the helper calls, closing braces).
20797 SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
20798 EVT VT = N->getValueType(0);
20800 // A vector built entirely of undefs is undef.
20801 if (ISD::allOperandsUndef(N))
20802 return DAG.getUNDEF(VT);
20804 // If this is a splat of a bitcast from another vector, change to a
20807 // (build_vector (i64 (bitcast (v2i32 X))), (i64 (bitcast (v2i32 X)))) ->
20808 // (v2i64 (bitcast (concat_vectors (v2i32 X), (v2i32 X))))
20810 // If X is a build_vector itself, the concat can become a larger build_vector.
20811 // TODO: Maybe this is useful for non-splat too?
20812 if (!LegalOperations) {
20813 if (SDValue Splat = cast<BuildVectorSDNode>(N)->getSplatValue()) {
20814 Splat = peekThroughBitcasts(Splat);
20815 EVT SrcVT = Splat.getValueType();
20816 if (SrcVT.isVector()) {
20817 unsigned NumElts = N->getNumOperands() * SrcVT.getVectorNumElements();
20818 EVT NewVT = EVT::getVectorVT(*DAG.getContext(),
20819 SrcVT.getVectorElementType(), NumElts);
// Only form the wider concat type if it is (or can become) legal.
20820 if (!LegalTypes || TLI.isTypeLegal(NewVT)) {
20821 SmallVector<SDValue, 8> Ops(N->getNumOperands(), Splat);
20822 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N),
20824 return DAG.getBitcast(VT, Concat);
20830 // Check if we can express BUILD VECTOR via subvector extract.
20831 if (!LegalTypes && (N->getNumOperands() > 1)) {
20832 SDValue Op0 = N->getOperand(0);
// Returns the constant lane index when Op is (extract_vector_elt V, C) with
// the same source vector V as the first operand; assumed to yield a
// non-matching value otherwise (full lambda body not visible here).
20833 auto checkElem = [&](SDValue Op) -> uint64_t {
20834 if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
20835 (Op0.getOperand(0) == Op.getOperand(0)))
20836 if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
20837 return CNode->getZExtValue();
// All elements must be consecutive lanes starting at Offset.
20841 int Offset = checkElem(Op0);
20842 for (unsigned i = 0; i < N->getNumOperands(); ++i) {
20843 if (Offset + i != checkElem(N->getOperand(i))) {
// Identity case: same type, index 0 — the build_vector IS the source.
20849 if ((Offset == 0) &&
20850 (Op0.getOperand(0).getValueType() == N->getValueType(0)))
20851 return Op0.getOperand(0);
20852 if ((Offset != -1) &&
20853 ((Offset % N->getValueType(0).getVectorNumElements()) ==
20854 0)) // IDX must be multiple of output size.
20855 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
20856 Op0.getOperand(0), Op0.getOperand(1));
// Delegate to the specialized build_vector reductions.
20859 if (SDValue V = convertBuildVecZextToZext(N))
20862 if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
20865 if (SDValue V = reduceBuildVecTruncToBitCast(N))
20868 if (SDValue V = reduceBuildVecToShuffle(N))
20871 // A splat of a single element is a SPLAT_VECTOR if supported on the target.
20872 // Do this late as some of the above may replace the splat.
20873 if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
20874 if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
20875 assert(!V.isUndef() && "Splat of undef should have been handled earlier");
20876 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
// Fold CONCAT_VECTORS whose operands are all scalar-to-vector bitcasts (or
// undef) into a single BUILD_VECTOR of the scalars, bitcast to the result
// type. Picks an integer or FP scalar type of matching width for all lanes.
// NOTE(review): this extract of the file is missing several short lines
// (early returns, `AnyFP`/`AnyInteger` assignments, closing braces).
20882 static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
20883 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20884 EVT OpVT = N->getOperand(0).getValueType();
20886 // If the operands are legal vectors, leave them alone.
20887 if (TLI.isTypeLegal(OpVT))
20891 EVT VT = N->getValueType(0);
20892 SmallVector<SDValue, 8> Ops;
// Default scalar carrier type: an integer as wide as one concat operand.
20894 EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
20895 SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
20897 // Keep track of what we encounter.
20898 bool AnyInteger = false;
20899 bool AnyFP = false;
20900 for (const SDValue &Op : N->ops()) {
// Collect the pre-bitcast scalar for each operand; undef operands become
// scalar undefs of the carrier type.
20901 if (ISD::BITCAST == Op.getOpcode() &&
20902 !Op.getOperand(0).getValueType().isVector())
20903 Ops.push_back(Op.getOperand(0));
20904 else if (ISD::UNDEF == Op.getOpcode())
20905 Ops.push_back(ScalarUndef);
20909 // Note whether we encounter an integer or floating point scalar.
20910 // If it's neither, bail out, it could be something weird like x86mmx.
20911 EVT LastOpVT = Ops.back().getValueType();
20912 if (LastOpVT.isFloatingPoint())
20914 else if (LastOpVT.isInteger())
20920 // If any of the operands is a floating point scalar bitcast to a vector,
20921 // use floating point types throughout, and bitcast everything.
20922 // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
20924 SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
20925 ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
// Normalize every collected scalar to the chosen carrier type.
20927 for (SDValue &Op : Ops) {
20928 if (Op.getValueType() == SVT)
20933 Op = DAG.getBitcast(SVT, Op);
// Build the scalar vector and bitcast it back to the concat's result type.
20938 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
20939 VT.getSizeInBits() / SVT.getSizeInBits());
20940 return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
20943 // Attempt to merge nested concat_vectors/undefs.
20944 // Fold concat_vectors(concat_vectors(x,y,z,w),u,u,concat_vectors(a,b,c,d))
20945 // --> concat_vectors(x,y,z,w,u,u,u,u,u,u,u,u,a,b,c,d)
// All non-undef operands must be CONCAT_VECTORS with the same (legal)
// subvector operand type; undef operands are expanded to the matching
// number of subvector-sized undefs.
// NOTE(review): this extract of the file is missing several short lines
// (undef-skip `continue`s, early returns, `FirstConcat = Op;` and braces).
20946 static SDValue combineConcatVectorOfConcatVectors(SDNode *N,
20947 SelectionDAG &DAG) {
20948 EVT VT = N->getValueType(0);
20950 // Ensure we're concatenating UNDEF and CONCAT_VECTORS nodes of similar types.
20952 SDValue FirstConcat;
20953 for (const SDValue &Op : N->ops()) {
20956 if (Op.getOpcode() != ISD::CONCAT_VECTORS)
20958 if (!FirstConcat) {
// Remember the inner-operand type from the first concat seen and require
// it to be legal before merging.
20959 SubVT = Op.getOperand(0).getValueType();
20960 if (!DAG.getTargetLoweringInfo().isTypeLegal(SubVT))
// Every subsequent concat must use the same subvector type.
20965 if (SubVT != Op.getOperand(0).getValueType())
20968 assert(FirstConcat && "Concat of all-undefs found");
// Second pass: flatten all inner operands into one operand list.
20970 SmallVector<SDValue> ConcatOps;
20971 for (const SDValue &Op : N->ops()) {
20972 if (Op.isUndef()) {
20973 ConcatOps.append(FirstConcat->getNumOperands(), DAG.getUNDEF(SubVT));
20976 ConcatOps.append(Op->op_begin(), Op->op_end());
20978 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, ConcatOps);
20981 // Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
20982 // operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
20983 // most two distinct vectors the same size as the result, attempt to turn this
20984 // into a legal shuffle.
// NOTE(review): this extract of the file is missing several short lines
// (`continue`s after the undef handling, early returns, SV0/SV1 assignments
// in the two-input tracking, closing braces).
20985 static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
20986 EVT VT = N->getValueType(0);
20987 EVT OpVT = N->getOperand(0).getValueType();
20989 // We currently can't generate an appropriate shuffle for a scalable vector.
20990 if (VT.isScalableVector())
20993 int NumElts = VT.getVectorNumElements();
20994 int NumOpElts = OpVT.getVectorNumElements();
// SV0/SV1 become the (at most two) distinct source vectors of the shuffle.
20996 SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
20997 SmallVector<int, 8> Mask;
20999 for (SDValue Op : N->ops()) {
21000 Op = peekThroughBitcasts(Op);
21002 // UNDEF nodes convert to UNDEF shuffle mask values.
21003 if (Op.isUndef()) {
21004 Mask.append((unsigned)NumOpElts, -1);
21008 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21011 // What vector are we extracting the subvector from and at what index?
21012 SDValue ExtVec = Op.getOperand(0);
21013 int ExtIdx = Op.getConstantOperandVal(1);
21015 // We want the EVT of the original extraction to correctly scale the
21016 // extraction index.
21017 EVT ExtVT = ExtVec.getValueType();
21018 ExtVec = peekThroughBitcasts(ExtVec);
21020 // UNDEF nodes convert to UNDEF shuffle mask values.
21021 if (ExtVec.isUndef()) {
21022 Mask.append((unsigned)NumOpElts, -1);
21026 // Ensure that we are extracting a subvector from a vector the same
21027 // size as the result.
21028 if (ExtVT.getSizeInBits() != VT.getSizeInBits())
21031 // Scale the subvector index to account for any bitcast.
21032 int NumExtElts = ExtVT.getVectorNumElements();
21033 if (0 == (NumExtElts % NumElts))
21034 ExtIdx /= (NumExtElts / NumElts);
21035 else if (0 == (NumElts % NumExtElts))
21036 ExtIdx *= (NumElts / NumExtElts);
21040 // At most we can reference 2 inputs in the final shuffle.
21041 if (SV0.isUndef() || SV0 == ExtVec) {
// First source: mask indices address lanes [0, NumElts).
21043 for (int i = 0; i != NumOpElts; ++i)
21044 Mask.push_back(i + ExtIdx);
21045 } else if (SV1.isUndef() || SV1 == ExtVec) {
// Second source: mask indices are offset by NumElts.
21047 for (int i = 0; i != NumOpElts; ++i)
21048 Mask.push_back(i + ExtIdx + NumElts);
// Let the target build a legal shuffle (or fail) from the gathered mask.
21054 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21055 return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
21056 DAG.getBitcast(VT, SV1), Mask, DAG);
// Fold concat_vectors of identical int<->fp casts into one wider cast:
//   concat (cast X), (cast Y), ... -> cast (concat X, Y, ...)
// provided the target supports the cast at the concatenated width.
// NOTE(review): this extract of the file is missing several short lines
// (the `break`/`default: return SDValue();` arms of both switches, early
// returns, and closing braces).
21059 static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
21060 unsigned CastOpcode = N->getOperand(0).getOpcode();
21061 switch (CastOpcode) {
21062 case ISD::SINT_TO_FP:
21063 case ISD::UINT_TO_FP:
21064 case ISD::FP_TO_SINT:
21065 case ISD::FP_TO_UINT:
21066 // TODO: Allow more opcodes?
21067 // case ISD::BITCAST:
21068 // case ISD::TRUNCATE:
21069 // case ISD::ZERO_EXTEND:
21070 // case ISD::SIGN_EXTEND:
21071 // case ISD::FP_EXTEND:
21077 EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
21078 if (!SrcVT.isVector())
21081 // All operands of the concat must be the same kind of cast from the same
21083 SmallVector<SDValue, 4> SrcOps;
21084 for (SDValue Op : N->ops()) {
// Each cast must be single-use so the fold does not duplicate work.
21085 if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
21086 Op.getOperand(0).getValueType() != SrcVT)
21088 SrcOps.push_back(Op.getOperand(0));
21091 // The wider cast must be supported by the target. This is unusual because
21092 // the operation support type parameter depends on the opcode. In addition,
21093 // check the other type in the cast to make sure this is really legal.
21094 EVT VT = N->getValueType(0);
21095 EVT SrcEltVT = SrcVT.getVectorElementType();
21096 ElementCount NumElts = SrcVT.getVectorElementCount() * N->getNumOperands();
21097 EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
21098 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21099 switch (CastOpcode) {
21100 case ISD::SINT_TO_FP:
21101 case ISD::UINT_TO_FP:
// int->fp: legality is keyed on the (wider) integer source type.
21102 if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
21103 !TLI.isTypeLegal(VT))
21106 case ISD::FP_TO_SINT:
21107 case ISD::FP_TO_UINT:
// fp->int: legality is keyed on the (wider) integer result type.
21108 if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
21109 !TLI.isTypeLegal(ConcatSrcVT))
21113 llvm_unreachable("Unexpected cast opcode");
21116 // concat (cast X), (cast Y)... -> cast (concat X, Y...)
21118 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
21119 return DAG.getNode(CastOpcode, DL, VT, NewConcat);
// Main CONCAT_VECTORS combine driver. Tries, in order: single-operand
// identity, all-undef, one-live-operand folds (larger concat / scalar ->
// scalar_to_vector), merging of BUILD_VECTOR operands, then the
// combineConcatVectorOf* helpers, and finally the identity-extract scan.
// NOTE(review): this extract of the file is missing several short lines
// (early returns, `return V;` after the helper calls, `continue`s in the
// identity-extract loop, closing braces).
21122 SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
21123 // If we only have one input vector, we don't need to do any concatenation.
21124 if (N->getNumOperands() == 1)
21125 return N->getOperand(0);
21127 // Check if all of the operands are undefs.
21128 EVT VT = N->getValueType(0);
21129 if (ISD::allOperandsUndef(N))
21130 return DAG.getUNDEF(VT);
21132 // Optimize concat_vectors where all but the first of the vectors are undef.
21133 if (all_of(drop_begin(N->ops()),
21134 [](const SDValue &Op) { return Op.isUndef(); })) {
21135 SDValue In = N->getOperand(0);
21136 assert(In.getValueType().isVector() && "Must concat vectors");
21138 // If the input is a concat_vectors, just make a larger concat by padding
21139 // with smaller undefs.
21140 if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
21141 unsigned NumOps = N->getNumOperands() * In.getNumOperands();
21142 SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
21143 Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
21144 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
21147 SDValue Scalar = peekThroughOneUseBitcasts(In);
21149 // concat_vectors(scalar_to_vector(scalar), undef) ->
21150 // scalar_to_vector(scalar)
21151 if (!LegalOperations && Scalar.getOpcode() == ISD::SCALAR_TO_VECTOR &&
21152 Scalar.hasOneUse()) {
21153 EVT SVT = Scalar.getValueType().getVectorElementType();
21154 if (SVT == Scalar.getOperand(0).getValueType())
21155 Scalar = Scalar.getOperand(0);
21158 // concat_vectors(scalar, undef) -> scalar_to_vector(scalar)
21159 if (!Scalar.getValueType().isVector()) {
21160 // If the bitcast type isn't legal, it might be a trunc of a legal type;
21161 // look through the trunc so we can still do the transform:
21162 // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
21163 if (Scalar->getOpcode() == ISD::TRUNCATE &&
21164 !TLI.isTypeLegal(Scalar.getValueType()) &&
21165 TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
21166 Scalar = Scalar->getOperand(0);
21168 EVT SclTy = Scalar.getValueType();
21170 if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
21173 // Bail out if the vector size is not a multiple of the scalar size.
21174 if (VT.getSizeInBits() % SclTy.getSizeInBits())
21177 unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
21178 if (VNTNumElms < 2)
21181 EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
21182 if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
// Rebuild as scalar_to_vector in the equivalent element type, then bitcast
// to the requested result type.
21185 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), NVT, Scalar);
21186 return DAG.getBitcast(VT, Res);
21190 // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
21191 // We have already tested above for an UNDEF only concatenation.
21192 // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
21193 // -> (BUILD_VECTOR A, B, ..., C, D, ...)
21194 auto IsBuildVectorOrUndef = [](const SDValue &Op) {
21195 return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
21197 if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
21198 SmallVector<SDValue, 8> Opnds;
21199 EVT SVT = VT.getScalarType();
21202 if (!SVT.isFloatingPoint()) {
21203 // If BUILD_VECTOR are from built from integer, they may have different
21204 // operand types. Get the smallest type and truncate all operands to it.
21205 bool FoundMinVT = false;
21206 for (const SDValue &Op : N->ops())
21207 if (ISD::BUILD_VECTOR == Op.getOpcode()) {
21208 EVT OpSVT = Op.getOperand(0).getValueType();
21209 MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
21212 assert(FoundMinVT && "Concat vector type mismatch");
21215 for (const SDValue &Op : N->ops()) {
21216 EVT OpVT = Op.getValueType();
21217 unsigned NumElts = OpVT.getVectorNumElements();
21219 if (ISD::UNDEF == Op.getOpcode())
21220 Opnds.append(NumElts, DAG.getUNDEF(MinVT));
21222 if (ISD::BUILD_VECTOR == Op.getOpcode()) {
21223 if (SVT.isFloatingPoint()) {
21224 assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
21225 Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
// Integer case: truncate each element to the common minimum type.
21227 for (unsigned i = 0; i != NumElts; ++i)
21229 DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
21234 assert(VT.getVectorNumElements() == Opnds.size() &&
21235 "Concat vector type mismatch");
21236 return DAG.getBuildVector(VT, SDLoc(N), Opnds);
21239 // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
21240 // FIXME: Add support for concat_vectors(bitcast(vec0),bitcast(vec1),...).
21241 if (SDValue V = combineConcatVectorOfScalars(N, DAG))
21244 if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
21245 // Fold CONCAT_VECTORS of CONCAT_VECTORS (or undef) to VECTOR_SHUFFLE.
21246 if (SDValue V = combineConcatVectorOfConcatVectors(N, DAG))
21249 // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
21250 if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
21254 if (SDValue V = combineConcatVectorOfCasts(N, DAG))
21257 // Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
21258 // nodes often generate nop CONCAT_VECTOR nodes. Scan the CONCAT_VECTOR
21259 // operands and look for a CONCAT operations that place the incoming vectors
21260 // at the exact same location.
21262 // For scalable vectors, EXTRACT_SUBVECTOR indexes are implicitly scaled.
21263 SDValue SingleSource = SDValue();
21264 unsigned PartNumElem =
21265 N->getOperand(0).getValueType().getVectorMinNumElements();
21267 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
21268 SDValue Op = N->getOperand(i);
21273 // Check if this is the identity extract:
21274 if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
21277 // Find the single incoming vector for the extract_subvector.
21278 if (SingleSource.getNode()) {
// All extracts must read from the same source vector.
21279 if (Op.getOperand(0) != SingleSource)
21282 SingleSource = Op.getOperand(0);
21284 // Check the source type is the same as the type of the result.
21285 // If not, this concat may extend the vector, so we can not
21286 // optimize it away.
21287 if (SingleSource.getValueType() != N->getValueType(0))
21291 // Check that we are reading from the identity index.
21292 unsigned IdentityIndex = i * PartNumElem;
21293 if (Op.getConstantOperandAPInt(1) != IdentityIndex)
21297 if (SingleSource.getNode())
21298 return SingleSource;
21303 // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
21304 // if the subvector can be sourced for free.
// Returns the SubVT-typed value at `Index` within V when V is
// (insert_subvector ?, X, Index) or (concat_vectors X0, X1, ...);
// the fall-through "return SDValue()" is not visible in this extract.
21305 static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
21306 if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
21307 V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
21308 return V.getOperand(1);
21310 auto *IndexC = dyn_cast<ConstantSDNode>(Index);
21311 if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
21312 V.getOperand(0).getValueType() == SubVT &&
21313 (IndexC->getZExtValue() % SubVT.getVectorMinNumElements()) == 0) {
// Map the element index onto the matching concat operand.
21314 uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorMinNumElements();
21315 return V.getOperand(SubIdx);
// Fold ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index
//   --> binop X, Y
// i.e. when both operands of a wide binop were only inserted to be
// extracted right back out, perform the binop directly at the narrow width.
// NOTE(review): the early `return SDValue();` lines after the guards below
// are missing from this extract of the file.
21320 static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
21322 bool LegalOperations) {
21323 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21324 SDValue BinOp = Extract->getOperand(0);
21325 unsigned BinOpcode = BinOp.getOpcode();
// Only single-result binary ops are handled.
21326 if (!TLI.isBinOp(BinOpcode) || BinOp->getNumValues() != 1)
21329 EVT VecVT = BinOp.getValueType();
21330 SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
21331 if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
21334 SDValue Index = Extract->getOperand(1);
21335 EVT SubVT = Extract->getValueType(0);
// The narrow binop must be supported by the target.
21336 if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT, LegalOperations))
21339 SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
21340 SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);
21342 // TODO: We could handle the case where only 1 operand is being inserted by
21343 // creating an extract of the other operand, but that requires checking
21344 // number of uses and/or costs.
21345 if (!Sub0 || !Sub1)
21348 // We are inserting both operands of the wide binop only to extract back
21349 // to the narrow vector size. Eliminate all of the insert/extract:
21350 // ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
21351 return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
21352 BinOp->getFlags());
21355 /// If we are extracting a subvector produced by a wide binary operator try
21356 /// to use a narrow binary operator and/or avoid concatenation and extraction.
// NOTE(review): this extract of the file is missing several short lines
// (early `return SDValue();`s after the guards, closing braces, and the
// `return SDValue();` lambda fall-through in GetSubVector).
21357 static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG,
21358 bool LegalOperations) {
21359 // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share
21360 // some of these bailouts with other transforms.
21362 if (SDValue V = narrowInsertExtractVectorBinOp(Extract, DAG, LegalOperations))
21365 // The extract index must be a constant, so we can map it to a concat operand.
21366 auto *ExtractIndexC = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
21367 if (!ExtractIndexC)
21370 // We are looking for an optionally bitcasted wide vector binary operator
21371 // feeding an extract subvector.
21372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21373 SDValue BinOp = peekThroughBitcasts(Extract->getOperand(0));
21374 unsigned BOpcode = BinOp.getOpcode();
21375 if (!TLI.isBinOp(BOpcode) || BinOp->getNumValues() != 1)
21378 // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
21379 // reduced to the unary fneg when it is visited, and we probably want to deal
21380 // with fneg in a target-specific way.
21381 if (BOpcode == ISD::FSUB) {
21382 auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
21383 if (C && C->getValueAPF().isNegZero())
21387 // The binop must be a vector type, so we can extract some fraction of it.
21388 EVT WideBVT = BinOp.getValueType();
21389 // The optimisations below currently assume we are dealing with fixed length
21390 // vectors. It is possible to add support for scalable vectors, but at the
21391 // moment we've done no analysis to prove whether they are profitable or not.
21392 if (!WideBVT.isFixedLengthVector())
21395 EVT VT = Extract->getValueType(0);
21396 unsigned ExtractIndex = ExtractIndexC->getZExtValue();
21397 assert(ExtractIndex % VT.getVectorNumElements() == 0 &&
21398 "Extract index is not a multiple of the vector length.");
21400 // Bail out if this is not a proper multiple width extraction.
21401 unsigned WideWidth = WideBVT.getSizeInBits();
21402 unsigned NarrowWidth = VT.getSizeInBits();
21403 if (WideWidth % NarrowWidth != 0)
21406 // Bail out if we are extracting a fraction of a single operation. This can
21407 // occur because we potentially looked through a bitcast of the binop.
21408 unsigned NarrowingRatio = WideWidth / NarrowWidth;
21409 unsigned WideNumElts = WideBVT.getVectorNumElements();
21410 if (WideNumElts % NarrowingRatio != 0)
21413 // Bail out if the target does not support a narrower version of the binop.
21414 EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(),
21415 WideNumElts / NarrowingRatio);
21416 if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT))
21419 // If extraction is cheap, we don't need to look at the binop operands
21420 // for concat ops. The narrow binop alone makes this transform profitable.
21421 // We can't just reuse the original extract index operand because we may have
// ExtBOIdx is the extract index re-expressed in NarrowBVT elements (the
// bitcast peek may have changed the element size).
21423 unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
21424 unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
21425 if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
21426 BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
21427 // extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
21429 SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
21430 SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
21431 BinOp.getOperand(0), NewExtIndex);
21432 SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
21433 BinOp.getOperand(1), NewExtIndex);
21434 SDValue NarrowBinOp =
21435 DAG.getNode(BOpcode, DL, NarrowBVT, X, Y, BinOp->getFlags());
21436 return DAG.getBitcast(VT, NarrowBinOp);
21439 // Only handle the case where we are doubling and then halving. A larger ratio
21440 // may require more than two narrow binops to replace the wide binop.
21441 if (NarrowingRatio != 2)
21444 // TODO: The motivating case for this transform is an x86 AVX1 target. That
21445 // target has temptingly almost legal versions of bitwise logic ops in 256-bit
21446 // flavors, but no other 256-bit integer support. This could be extended to
21447 // handle any binop, but that may require fixing/adding other folds to avoid
21448 // codegen regressions.
21449 if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR)
21452 // We need at least one concatenation operation of a binop operand to make
21453 // this transform worthwhile. The concat must double the input vector sizes.
21454 auto GetSubVector = [ConcatOpNum](SDValue V) -> SDValue {
21455 if (V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2)
21456 return V.getOperand(ConcatOpNum);
21459 SDValue SubVecL = GetSubVector(peekThroughBitcasts(BinOp.getOperand(0)));
21460 SDValue SubVecR = GetSubVector(peekThroughBitcasts(BinOp.getOperand(1)));
21462 if (SubVecL || SubVecR) {
21463 // If a binop operand was not the result of a concat, we must extract a
21464 // half-sized operand for our new narrow binop:
21465 // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN
21466 // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
21467 // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
21469 SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
21470 SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
21471 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
21472 BinOp.getOperand(0), IndexC);
21474 SDValue Y = SubVecR ? DAG.getBitcast(NarrowBVT, SubVecR)
21475 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
21476 BinOp.getOperand(1), IndexC);
21478 SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y);
21479 return DAG.getBitcast(VT, NarrowBinOp);
21485 /// If we are extracting a subvector from a wide vector load, convert to a
21486 /// narrow load to eliminate the extraction:
21487 /// (extract_subvector (load wide vector)) --> (load narrow vector)
// NOTE(review): this extract of the file is missing several short lines
// (early returns after the guards, the fixed-offset MachineMemOperand
// arguments, and the final `return NewLd;`).
21488 static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
21489 // TODO: Add support for big-endian. The offset calculation must be adjusted.
21490 if (DAG.getDataLayout().isBigEndian())
// Only simple (non-extending, non-atomic/volatile) loads are narrowed.
21493 auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
21494 if (!Ld || Ld->getExtensionType() || !Ld->isSimple())
21497 // Allow targets to opt-out.
21498 EVT VT = Extract->getValueType(0);
21500 // We can only create byte sized loads.
21501 if (!VT.isByteSized())
21504 unsigned Index = Extract->getConstantOperandVal(1);
21505 unsigned NumElts = VT.getVectorMinNumElements();
21507 // The definition of EXTRACT_SUBVECTOR states that the index must be a
21508 // multiple of the minimum number of elements in the result type.
21509 assert(Index % NumElts == 0 && "The extract subvector index is not a "
21510 "multiple of the result's element count");
21512 // It's fine to use TypeSize here as we know the offset will not be negative.
21513 TypeSize Offset = VT.getStoreSize() * (Index / NumElts);
21515 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21516 if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
21519 // The narrow load will be offset from the base address of the old load if
21520 // we are extracting from something besides index 0 (little-endian).
21523 // TODO: Use "BaseIndexOffset" to make this more effective.
21524 SDValue NewAddr = DAG.getMemBasePlusOffset(Ld->getBasePtr(), Offset, DL);
21526 uint64_t StoreSize = MemoryLocation::getSizeOrUnknown(VT.getStoreSize());
21527 MachineFunction &MF = DAG.getMachineFunction();
21528 MachineMemOperand *MMO;
// Scalable offsets cannot be folded into the pointer info, so fall back to
// an address-space-only MachinePointerInfo for the new memory operand.
21529 if (Offset.isScalable()) {
21530 MachinePointerInfo MPI =
21531 MachinePointerInfo(Ld->getPointerInfo().getAddrSpace());
21532 MMO = MF.getMachineMemOperand(Ld->getMemOperand(), MPI, StoreSize);
21534 MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset.getFixedSize(),
// Preserve ordering against the original load's chain users.
21537 SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO);
21538 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
21542 /// Given EXTRACT_SUBVECTOR(VECTOR_SHUFFLE(Op0, Op1, Mask)),
21543 /// try to produce VECTOR_SHUFFLE(EXTRACT_SUBVECTOR(Op?, ?),
21544 ///                              EXTRACT_SUBVECTOR(Op?, ?),
21546 /// iff it is legal and profitable to do so. Notably, the trimmed mask
21547 /// (containing only the elements that are extracted)
21548 /// must reference at most two subvectors.
// NOTE(review): a few short lines (early returns / `continue`s) are missing
// from this extract of the file.
21549 static SDValue foldExtractSubvectorFromShuffleVector(SDNode *N,
21551 const TargetLowering &TLI,
21552 bool LegalOperations) {
21553 assert(N->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
21554 "Must only be called on EXTRACT_SUBVECTOR's");
21556 SDValue N0 = N->getOperand(0);
21558 // Only deal with non-scalable vectors.
21559 EVT NarrowVT = N->getValueType(0);
21560 EVT WideVT = N0.getValueType();
21561 if (!NarrowVT.isFixedLengthVector() || !WideVT.isFixedLengthVector())
21564 // The operand must be a shufflevector.
21565 auto *WideShuffleVector = dyn_cast<ShuffleVectorSDNode>(N0);
21566 if (!WideShuffleVector)
21569 // The old shuffle needs to go away.
21570 if (!WideShuffleVector->hasOneUse())
21573 // And the narrow shufflevector that we'll form must be legal.
21574 if (LegalOperations &&
21575 !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, NarrowVT))
21578 uint64_t FirstExtractedEltIdx = N->getConstantOperandVal(1);
21579 int NumEltsExtracted = NarrowVT.getVectorNumElements();
21580 assert((FirstExtractedEltIdx % NumEltsExtracted) == 0 &&
21581 "Extract index is not a multiple of the output vector length.");
21583 int WideNumElts = WideVT.getVectorNumElements();
21585 SmallVector<int, 16> NewMask;
21586 NewMask.reserve(NumEltsExtracted);
// Ordered set of (shuffle operand, subvector index) pairs the trimmed mask
// actually reads from; its insertion order defines the new operand order.
21587 SmallSetVector<std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>, 2>
21588 DemandedSubvectors;
21590 // Try to decode the wide mask into narrow mask from at most two subvectors.
21591 for (int M : WideShuffleVector->getMask().slice(FirstExtractedEltIdx,
21592 NumEltsExtracted)) {
21593 assert((M >= -1) && (M < (2 * WideNumElts)) &&
21594 "Out-of-bounds shuffle mask?");
21597 // Does not depend on operands, does not require adjustment.
21598 NewMask.emplace_back(M);
21602 // From which operand of the shuffle does this shuffle mask element pick?
21603 int WideShufOpIdx = M / WideNumElts;
21604 // Which element of that operand is picked?
21605 int OpEltIdx = M % WideNumElts;
21607 assert((OpEltIdx + WideShufOpIdx * WideNumElts) == M &&
21608 "Shuffle mask vector decomposition failure.");
21610 // And which NumEltsExtracted-sized subvector of that operand is that?
21611 int OpSubvecIdx = OpEltIdx / NumEltsExtracted;
21612 // And which element within that subvector of that operand is that?
21613 int OpEltIdxInSubvec = OpEltIdx % NumEltsExtracted;
21615 assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted) == OpEltIdx &&
21616 "Shuffle mask subvector decomposition failure.");
21618 assert((OpEltIdxInSubvec + OpSubvecIdx * NumEltsExtracted +
21619 WideShufOpIdx * WideNumElts) == M &&
21620 "Shuffle mask full decomposition failure.");
21622 SDValue Op = WideShuffleVector->getOperand(WideShufOpIdx);
21624 if (Op.isUndef()) {
21625 // Picking from an undef operand. Let's adjust mask instead.
21626 NewMask.emplace_back(-1);
21630 // Profitability check: only deal with extractions from the first subvector.
21631 if (OpSubvecIdx != 0)
21634 const std::pair<SDValue, int> DemandedSubvector =
21635 std::make_pair(Op, OpSubvecIdx);
21637 if (DemandedSubvectors.insert(DemandedSubvector)) {
21638 if (DemandedSubvectors.size() > 2)
21639 return SDValue(); // We can't handle more than two subvectors.
21640 // How many elements into the WideVT does this subvector start?
21641 int Index = NumEltsExtracted * OpSubvecIdx;
21642 // Bail out if the extraction isn't going to be cheap.
21643 if (!TLI.isExtractSubvectorCheap(NarrowVT, WideVT, Index))
21647 // Ok, but from which operand of the new shuffle will this element pick?
21649 getFirstIndexOf(DemandedSubvectors.getArrayRef(), DemandedSubvector)
21650 assert((NewOpIdx == 0 || NewOpIdx == 1) && "Unexpected operand index.");
// Rebase the lane onto the chosen new operand.
21652 int AdjM = OpEltIdxInSubvec + NewOpIdx * NumEltsExtracted;
21653 NewMask.emplace_back(AdjM);
21655 assert(NewMask.size() == (unsigned)NumEltsExtracted && "Produced bad mask.");
21656 assert(DemandedSubvectors.size() <= 2 &&
21657 "Should have ended up demanding at most two subvectors.");
21659 // Did we discover that the shuffle does not actually depend on operands?
21660 if (DemandedSubvectors.empty())
21661 return DAG.getUNDEF(NarrowVT);
21663 // We still perform the exact same EXTRACT_SUBVECTOR, just on different
21664 // operand[s]/index[es], so there is no point in checking for it's legality.
21666 // Do not turn a legal shuffle into an illegal one.
21667 if (TLI.isShuffleMaskLegal(WideShuffleVector->getMask(), WideVT) &&
21668 !TLI.isShuffleMaskLegal(NewMask, NarrowVT))
// Materialize the demanded subvector extractions as new shuffle operands.
21673 SmallVector<SDValue, 2> NewOps;
21674 for (const std::pair<SDValue /*Op*/, int /*SubvectorIndex*/>
21675 &DemandedSubvector : DemandedSubvectors) {
21676 // How many elements into the WideVT does this subvector start?
21677 int Index = NumEltsExtracted * DemandedSubvector.second;
21678 SDValue IndexC = DAG.getVectorIdxConstant(Index, DL);
21679 NewOps.emplace_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowVT,
21680 DemandedSubvector.first, IndexC));
21682 assert((NewOps.size() == 1 || NewOps.size() == 2) &&
21683 "Should end up with either one or two ops");
21685 // If we ended up with only one operand, pad with an undef.
21686 if (NewOps.size() == 1)
21687 NewOps.emplace_back(DAG.getUNDEF(NarrowVT));
21689 return DAG.getVectorShuffle(NarrowVT, DL, NewOps[0], NewOps[1], NewMask);
// Combine/simplify an EXTRACT_SUBVECTOR node. Tries, in order: undef source,
// narrowing an extracted load, extract-of-extract, extract-of-splat,
// commuting with a bitcast, extract from CONCAT_VECTORS, extract from a
// shuffle, shrinking a BUILD_VECTOR, extract from INSERT_SUBVECTOR,
// narrowing a binop, and finally demanded-elements simplification.
// NOTE(review): several fall-through `return SDValue();` lines and closing
// braces appear elided in this extraction — confirm against upstream.
21692 SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// NVT is the narrow result type; V is the wide source; ExtIdx is the
// constant element index where extraction begins.
21693 EVT NVT = N->getValueType(0);
21694 SDValue V = N->getOperand(0);
21695 uint64_t ExtIdx = N->getConstantOperandVal(1);
21697 // Extract from UNDEF is UNDEF.
21699 return DAG.getUNDEF(NVT);
// If a narrow load of NVT is acceptable to the target, try to replace an
// extract-of-load with a smaller load.
21701 if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT))
21702 if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG))
21705 // Combine an extract of an extract into a single extract_subvector.
21706 // ext (ext X, C), 0 --> ext X, C
21707 if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
21708 if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
21709 V.getConstantOperandVal(1)) &&
21710 TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
21711 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT, V.getOperand(0),
21716 // ty1 extract_vector(ty2 splat(V))) -> ty1 splat(V)
21717 if (V.getOpcode() == ISD::SPLAT_VECTOR)
21718 if (DAG.isConstantValueOfAnyType(V.getOperand(0)) || V.hasOneUse())
21719 if (!LegalOperations || TLI.isOperationLegal(ISD::SPLAT_VECTOR, NVT))
21720 return DAG.getSplatVector(NVT, SDLoc(N), V.getOperand(0));
21722 // Try to move vector bitcast after extract_subv by scaling extraction index:
21723 // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
21724 if (V.getOpcode() == ISD::BITCAST &&
21725 V.getOperand(0).getValueType().isVector() &&
21726 (!LegalOperations || TLI.isOperationLegal(ISD::BITCAST, NVT))) {
21727 SDValue SrcOp = V.getOperand(0);
21728 EVT SrcVT = SrcOp.getValueType();
// Min-num-elements is used so the ratio math is valid for scalable vectors.
21729 unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
21730 unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
// Case 1: the bitcast narrows elements (more src elts than dst elts);
// scale the extract index and element count UP by the ratio.
21731 if ((SrcNumElts % DestNumElts) == 0) {
21732 unsigned SrcDestRatio = SrcNumElts / DestNumElts;
21733 ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
21734 EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
21736 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
21738 SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
21739 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
21740 V.getOperand(0), NewIndex);
21741 return DAG.getBitcast(NVT, NewExtract);
// Case 2: the bitcast widens elements; scale index/count DOWN, which is
// only exact when both divide evenly by the ratio.
21744 if ((DestNumElts % SrcNumElts) == 0) {
21745 unsigned DestSrcRatio = DestNumElts / SrcNumElts;
21746 if (NVT.getVectorElementCount().isKnownMultipleOf(DestSrcRatio)) {
21747 ElementCount NewExtEC =
21748 NVT.getVectorElementCount().divideCoefficientBy(DestSrcRatio);
21749 EVT ScalarVT = SrcVT.getScalarType();
21750 if ((ExtIdx % DestSrcRatio) == 0) {
21752 unsigned IndexValScaled = ExtIdx / DestSrcRatio;
21754 EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
21755 if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
21756 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
21757 SDValue NewExtract =
21758 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
21759 V.getOperand(0), NewIndex);
21760 return DAG.getBitcast(NVT, NewExtract);
// If the scaled extract would produce a single element, use
// EXTRACT_VECTOR_ELT instead of a one-element subvector.
21762 if (NewExtEC.isScalar() &&
21763 TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
21764 SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
21765 SDValue NewExtract =
21766 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
21767 V.getOperand(0), NewIndex);
21768 return DAG.getBitcast(NVT, NewExtract);
// Extract from a concatenation: either pick a whole concat operand or a
// fraction of one.
21775 if (V.getOpcode() == ISD::CONCAT_VECTORS) {
21776 unsigned ExtNumElts = NVT.getVectorMinNumElements();
21777 EVT ConcatSrcVT = V.getOperand(0).getValueType();
21778 assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
21779 "Concat and extract subvector do not change element type");
21780 assert((ExtIdx % ExtNumElts) == 0 &&
21781 "Extract index is not a multiple of the input vector length.");
21783 unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
21784 unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
21786 // If the concatenated source types match this extract, it's a direct
21788 // extract_subvec (concat V1, V2, ...), i --> Vi
21789 if (NVT.getVectorElementCount() == ConcatSrcVT.getVectorElementCount())
21790 return V.getOperand(ConcatOpIdx);
21792 // If the concatenated source vectors are a multiple length of this extract,
21793 // then extract a fraction of one of those source vectors directly from a
21794 // concat operand. Example:
21795 // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
21796 // v2i8 extract_subvec v8i8 Y, 6
21797 if (NVT.isFixedLengthVector() && ConcatSrcVT.isFixedLengthVector() &&
21798 ConcatSrcNumElts % ExtNumElts == 0) {
// Rebase the index relative to the chosen concat operand.
21800 unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
21801 assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
21802 "Trying to extract from >1 concat operand?");
21803 assert(NewExtIdx % ExtNumElts == 0 &&
21804 "Extract index is not a multiple of the input vector length.");
21805 SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
21806 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
21807 V.getOperand(ConcatOpIdx), NewIndexC);
// Try narrowing an extract of a wide shuffle (helper defined above).
21812 foldExtractSubvectorFromShuffleVector(N, DAG, TLI, LegalOperations))
21815 V = peekThroughBitcasts(V);
21817 // If the input is a build vector. Try to make a smaller build vector.
21818 if (V.getOpcode() == ISD::BUILD_VECTOR) {
21819 EVT InVT = V.getValueType();
21820 unsigned ExtractSize = NVT.getSizeInBits();
21821 unsigned EltSize = InVT.getScalarSizeInBits();
21822 // Only do this if we won't split any elements.
21823 if (ExtractSize % EltSize == 0) {
21824 unsigned NumElems = ExtractSize / EltSize;
21825 EVT EltVT = InVT.getVectorElementType();
// When only one element is extracted the result type is scalar, not vector.
21827 NumElems == 1 ? EltVT
21828 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
21829 if ((Level < AfterLegalizeDAG ||
21831 TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
21832 (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
21833 unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
21835 if (NumElems == 1) {
21836 SDValue Src = V->getOperand(IdxVal);
21837 if (EltVT != Src.getValueType())
21838 Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
21839 return DAG.getBitcast(NVT, Src);
21842 // Extract the pieces from the original build_vector.
21843 SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
21844 V->ops().slice(IdxVal, NumElems));
21845 return DAG.getBitcast(NVT, BuildVec);
21850 if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
21851 // Handle only simple case where vector being inserted and vector
21852 // being extracted are of same size.
21853 EVT SmallVT = V.getOperand(1).getValueType();
21854 if (!NVT.bitsEq(SmallVT))
21858 // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
21860 // indices are equal or bit offsets are equal => V1
21861 // otherwise => (extract_subvec V1, ExtIdx)
21862 uint64_t InsIdx = V.getConstantOperandVal(2);
// Compare bit offsets (not raw indices) so differing element types match.
21863 if (InsIdx * SmallVT.getScalarSizeInBits() ==
21864 ExtIdx * NVT.getScalarSizeInBits()) {
21865 if (LegalOperations && !TLI.isOperationLegal(ISD::BITCAST, NVT))
21868 return DAG.getBitcast(NVT, V.getOperand(1));
// The insert is at a different offset; extract from the base vector V1.
21870 return DAG.getNode(
21871 ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
21872 DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
21876 if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG, LegalOperations))
// Last resort: simplify based on which result elements are demanded.
21879 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
21880 return SDValue(N, 0);
21885 /// Try to convert a wide shuffle of concatenated vectors into 2 narrow shuffles
21886 /// followed by concatenation. Narrow vector ops may have better performance
21887 /// than wide ops, and this can unlock further narrowing of other vector ops.
21888 /// Targets can invert this transform later if it is not profitable.
/// Pattern: shuffle (concat X, undef), (concat Y, undef), Mask
///      --> concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
/// Returns an empty SDValue if the operands do not match the pattern or the
/// target rejects either half-mask.
21889 static SDValue foldShuffleOfConcatUndefs(ShuffleVectorSDNode *Shuf,
21890 SelectionDAG &DAG) {
// Both operands must be 2-operand concats whose upper halves are undef.
21891 SDValue N0 = Shuf->getOperand(0), N1 = Shuf->getOperand(1);
21892 if (N0.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
21893 N1.getOpcode() != ISD::CONCAT_VECTORS || N1.getNumOperands() != 2 ||
21894 !N0.getOperand(1).isUndef() || !N1.getOperand(1).isUndef())
21897 // Split the wide shuffle mask into halves. Any mask element that is accessing
21898 // operand 1 is offset down to account for narrowing of the vectors.
21899 ArrayRef<int> Mask = Shuf->getMask();
21900 EVT VT = Shuf->getValueType(0);
21901 unsigned NumElts = VT.getVectorNumElements();
21902 unsigned HalfNumElts = NumElts / 2;
// Both half masks start fully undef (-1) and are filled in below.
21903 SmallVector<int, 16> Mask0(HalfNumElts, -1);
21904 SmallVector<int, 16> Mask1(HalfNumElts, -1);
21905 for (unsigned i = 0; i != NumElts; ++i) {
21908 // If we reference the upper (undef) subvector then the element is undef.
21909 if ((Mask[i] % NumElts) >= HalfNumElts)
// Elements that select from operand 1 are rebased into the narrow index
// space (the undef upper halves disappear after narrowing).
21911 int M = Mask[i] < (int)NumElts ? Mask[i] : Mask[i] - (int)HalfNumElts;
21912 if (i < HalfNumElts)
21915 Mask1[i - HalfNumElts] = M;
21918 // Ask the target if this is a valid transform.
21919 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21920 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
21922 if (!TLI.isShuffleMaskLegal(Mask0, HalfVT) ||
21923 !TLI.isShuffleMaskLegal(Mask1, HalfVT))
21926 // shuffle (concat X, undef), (concat Y, undef), Mask -->
21927 // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1)
21928 SDValue X = N0.getOperand(0), Y = N1.getOperand(0);
21930 SDValue Shuf0 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask0);
21931 SDValue Shuf1 = DAG.getVectorShuffle(HalfVT, DL, X, Y, Mask1);
21932 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Shuf0, Shuf1);
21935 // Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
21936 // or turn a shuffle of a single concat into simpler shuffle then concat.
// Succeeds only when every NumElemsPerConcat-sized chunk of the mask is an
// exact (or undef) copy of one concat operand; otherwise returns empty.
21937 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
21938 EVT VT = N->getValueType(0);
21939 unsigned NumElts = VT.getVectorNumElements();
21941 SDValue N0 = N->getOperand(0);
21942 SDValue N1 = N->getOperand(1);
21943 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
21944 ArrayRef<int> Mask = SVN->getMask();
21946 SmallVector<SDValue, 4> Ops;
// ConcatVT is the type of one concat operand; chunks of the mask are
// examined at this granularity.
21947 EVT ConcatVT = N0.getOperand(0).getValueType();
21948 unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
21949 unsigned NumConcats = NumElts / NumElemsPerConcat;
21951 auto IsUndefMaskElt = [](int i) { return i == -1; };
21953 // Special case: shuffle(concat(A,B)) can be more efficiently represented
21954 // as concat(shuffle(A,B),UNDEF) if the shuffle doesn't set any of the high
21955 // half vector elements.
21956 if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
21957 llvm::all_of(Mask.slice(NumElemsPerConcat, NumElemsPerConcat),
// Build a narrow shuffle from the low half of the mask, pad with undef.
21959 N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
21961 Mask.slice(0, NumElemsPerConcat));
21962 N1 = DAG.getUNDEF(ConcatVT);
21963 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
21966 // Look at every vector that's inserted. We're looking for exact
21967 // subvector-sized copies from a concatenated vector
21968 for (unsigned I = 0; I != NumConcats; ++I) {
21969 unsigned Begin = I * NumElemsPerConcat;
21970 ArrayRef<int> SubMask = Mask.slice(Begin, NumElemsPerConcat);
21972 // Make sure we're dealing with a copy.
// An all-undef chunk maps to an undef concat operand.
21973 if (llvm::all_of(SubMask, IsUndefMaskElt)) {
21974 Ops.push_back(DAG.getUNDEF(ConcatVT));
// Every defined mask element in the chunk must be an identity lane within
// a single concat operand (same OpIdx for the whole chunk).
21979 for (int i = 0; i != (int)NumElemsPerConcat; ++i) {
21980 if (IsUndefMaskElt(SubMask[i]))
21982 if ((SubMask[i] % (int)NumElemsPerConcat) != i)
21984 int EltOpIdx = SubMask[i] / NumElemsPerConcat;
21985 if (0 <= OpIdx && EltOpIdx != OpIdx)
21989 assert(0 <= OpIdx && "Unknown concat_vectors op");
// OpIdx counts across N0's operands first, then N1's.
21991 if (OpIdx < (int)N0.getNumOperands())
21992 Ops.push_back(N0.getOperand(OpIdx))
21994 Ops.push_back(N1.getOperand(OpIdx - N0.getNumOperands()));
21997 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
22000 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
22001 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
22003 // SHUFFLE(BUILD_VECTOR(), BUILD_VECTOR()) -> BUILD_VECTOR() is always
22004 // a simplification in some sense, but it isn't appropriate in general: some
22005 // BUILD_VECTORs are substantially cheaper than others. The general case
22006 // of a BUILD_VECTOR requires inserting each element individually (or
22007 // performing the equivalent in a temporary stack variable). A BUILD_VECTOR of
22008 // all constants is a single constant pool load. A BUILD_VECTOR where each
22009 // element is identical is a splat. A BUILD_VECTOR where most of the operands
22010 // are undef lowers to a small number of element insertions.
22012 // To deal with this, we currently use a bunch of mostly arbitrary heuristics.
22013 // We don't fold shuffles where one side is a non-zero constant, and we don't
22014 // fold shuffles if the resulting (non-splat) BUILD_VECTOR would have duplicate
22015 // non-constant operands. This seems to work out reasonably well in practice.
22016 static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
22018 const TargetLowering &TLI) {
22019 EVT VT = SVN->getValueType(0);
22020 unsigned NumElts = VT.getVectorNumElements();
22021 SDValue N0 = SVN->getOperand(0);
22022 SDValue N1 = SVN->getOperand(1);
// Only fold when this shuffle is the sole user of its source vectors.
22024 if (!N0->hasOneUse())
22027 // If only one of N1,N2 is constant, bail out if it is not ALL_ZEROS as
22028 // discussed above.
22029 if (!N1.isUndef()) {
22030 if (!N1->hasOneUse())
22033 bool N0AnyConst = isAnyConstantBuildVector(N0);
22034 bool N1AnyConst = isAnyConstantBuildVector(N1);
22035 if (N0AnyConst && !N1AnyConst && !ISD::isBuildVectorAllZeros(N0.getNode()))
22037 if (!N0AnyConst && N1AnyConst && !ISD::isBuildVectorAllZeros(N1.getNode()))
22041 // If both inputs are splats of the same value then we can safely merge this
22042 // to a single BUILD_VECTOR with undef elements based on the shuffle mask.
22043 bool IsSplat = false;
22044 auto *BV0 = dyn_cast<BuildVectorSDNode>(N0);
22045 auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
22047 if (SDValue Splat0 = BV0->getSplatValue())
22048 IsSplat = (Splat0 == BV1->getSplatValue());
22050 SmallVector<SDValue, 8> Ops;
// DuplicateOps tracks non-constant scalars already used, to enforce the
// "no duplicate non-constant operands" heuristic below.
22051 SmallSet<SDValue, 16> DuplicateOps;
22052 for (int M : SVN->getMask()) {
// Default each lane to undef; overwritten when the mask selects a real lane.
22053 SDValue Op = DAG.getUNDEF(VT.getScalarType());
// Map the mask element to (source vector, lane index within it).
22055 int Idx = M < (int)NumElts ? M : M - NumElts;
22056 SDValue &S = (M < (int)NumElts ? N0 : N1);
22057 if (S.getOpcode() == ISD::BUILD_VECTOR) {
22058 Op = S.getOperand(Idx);
22059 } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR) {
// SCALAR_TO_VECTOR defines only lane 0; other lanes are undef.
22060 SDValue Op0 = S.getOperand(0);
22061 Op = Idx == 0 ? Op0 : DAG.getUNDEF(Op0.getValueType());
22063 // Operand can't be combined - bail out.
22068 // Don't duplicate a non-constant BUILD_VECTOR operand unless we're
22069 // generating a splat; semantically, this is fine, but it's likely to
22070 // generate low-quality code if the target can't reconstruct an appropriate
22072 if (!Op.isUndef() && !isIntOrFPConstant(Op))
22073 if (!IsSplat && !DuplicateOps.insert(Op).second)
22079 // BUILD_VECTOR requires all inputs to be of the same type, find the
22080 // maximum type and extend them all.
22081 EVT SVT = VT.getScalarType();
22082 if (SVT.isInteger())
22083 for (SDValue &Op : Ops)
22084 SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
// Widen every operand to SVT, preferring zext when the target says it's free.
22085 if (SVT != VT.getScalarType())
22086 for (SDValue &Op : Ops)
22087 Op = Op.isUndef() ? DAG.getUNDEF(SVT)
22088 : (TLI.isZExtFree(Op.getValueType(), SVT)
22089 ? DAG.getZExtOrTrunc(Op, SDLoc(SVN), SVT)
22090 : DAG.getSExtOrTrunc(Op, SDLoc(SVN), SVT));
22091 return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
22094 // Match shuffles that can be converted to any_vector_extend_in_reg.
22095 // This is often generated during legalization.
22096 // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src))
22097 // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
// Returns the bitcast ANY_EXTEND_VECTOR_INREG replacement, or empty if no
// power-of-2 extension scale matches the mask.
22098 static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
22100 const TargetLowering &TLI,
22101 bool LegalOperations) {
22102 EVT VT = SVN->getValueType(0);
22103 bool IsBigEndian = DAG.getDataLayout().isBigEndian();
22105 // TODO Add support for big-endian when we have a test case.
22106 if (!VT.isInteger() || IsBigEndian)
22109 unsigned NumElts = VT.getVectorNumElements();
22110 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22111 ArrayRef<int> Mask = SVN->getMask();
22112 SDValue N0 = SVN->getOperand(0);
22114 // shuffle<0,-1,1,-1> == (v2i64 anyextend_vector_inreg(v4i32))
// A scale-S any-extend mask has element i/S at every position i that is a
// multiple of S; the in-between positions are don't-care (undef).
22115 auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
22116 for (unsigned i = 0; i != NumElts; ++i) {
22119 if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
22126 // Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
22127 // power-of-2 extensions as they are the most likely.
22128 for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
22129 // Check for non power of 2 vector sizes
22130 if (NumElts % Scale != 0)
22132 if (!isAnyExtend(Scale))
// Result element type is Scale times wider; element count shrinks by Scale.
22135 EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
22136 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
22137 // Never create an illegal type. Only create unsupported operations if we
22138 // are pre-legalization.
22139 if (TLI.isTypeLegal(OutVT))
22140 if (!LegalOperations ||
22141 TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
22142 return DAG.getBitcast(VT,
22143 DAG.getNode(ISD::ANY_EXTEND_VECTOR_INREG,
22144 SDLoc(SVN), OutVT, N0));
22150 // Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
22151 // each source element of a large type into the lowest elements of a smaller
22152 // destination type. This is often generated during legalization.
22153 // If the source node itself was a '*_extend_vector_inreg' node then we should
22154 // then be able to remove it.
// Returns a bitcast of the pre-extension value when the truncating shuffle
// exactly cancels the extension; empty otherwise.
22155 static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN,
22156 SelectionDAG &DAG) {
22157 EVT VT = SVN->getValueType(0);
22158 bool IsBigEndian = DAG.getDataLayout().isBigEndian();
22160 // TODO Add support for big-endian when we have a test case.
22161 if (!VT.isInteger() || IsBigEndian)
// Look through bitcasts to find the extension node feeding the shuffle.
22164 SDValue N0 = peekThroughBitcasts(SVN->getOperand(0));
22166 unsigned Opcode = N0.getOpcode();
22167 if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
22168 Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
22169 Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
22172 SDValue N00 = N0.getOperand(0);
22173 ArrayRef<int> Mask = SVN->getMask();
22174 unsigned NumElts = VT.getVectorNumElements();
22175 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22176 unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
22177 unsigned ExtDstSizeInBits = N0.getScalarValueSizeInBits();
// The extension scale must be integral for the masks below to line up.
22179 if (ExtDstSizeInBits % ExtSrcSizeInBits != 0)
22181 unsigned ExtScale = ExtDstSizeInBits / ExtSrcSizeInBits;
22183 // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2-1,-1>
22184 // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
22185 // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
// A scale-S truncate mask selects lane i*S at position i while it stays in
// range; trailing positions are don't-care.
22186 auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
22187 for (unsigned i = 0; i != NumElts; ++i) {
22190 if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
22197 // At the moment we just handle the case where we've truncated back to the
22198 // same size as before the extension.
22199 // TODO: handle more extension/truncation cases as cases arise.
22200 if (EltSizeInBits != ExtSrcSizeInBits)
22203 // We can remove *extend_vector_inreg only if the truncation happens at
22204 // the same scale as the extension.
22205 if (isTruncate(ExtScale))
22206 return DAG.getBitcast(VT, N00);
22211 // Combine shuffles of splat-shuffles of the form:
22212 // shuffle (shuffle V, undef, splat-mask), undef, M
22213 // If splat-mask contains undef elements, we need to be careful about
22214 // introducing undef's in the folded mask which are not the result of composing
22215 // the masks of the shuffles.
// Returns either the inner splat value/shuffle directly, or a new shuffle
// whose mask is the composition of both masks; empty if no fold applies.
22216 static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
22217 SelectionDAG &DAG) {
// Only unary shuffles (RHS undef) are handled.
22218 if (!Shuf->getOperand(1).isUndef())
22221 // If the inner operand is a known splat with no undefs, just return that directly.
22222 // TODO: Create DemandedElts mask from Shuf's mask.
22223 // TODO: Allow undef elements and merge with the shuffle code below.
22224 if (DAG.isSplatValue(Shuf->getOperand(0), /*AllowUndefs*/ false))
22225 return Shuf->getOperand(0);
22227 auto *Splat = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
22228 if (!Splat || !Splat->isSplat())
22231 ArrayRef<int> ShufMask = Shuf->getMask();
22232 ArrayRef<int> SplatMask = Splat->getMask();
22233 assert(ShufMask.size() == SplatMask.size() && "Mask length mismatch");
22235 // Prefer simplifying to the splat-shuffle, if possible. This is legal if
22236 // every undef mask element in the splat-shuffle has a corresponding undef
22237 // element in the user-shuffle's mask or if the composition of mask elements
22238 // would result in undef.
22239 // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask):
22240 // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u]
22241 // In this case it is not legal to simplify to the splat-shuffle because we
22242 // may be exposing the users of the shuffle an undef element at index 1
22243 // which was not there before the combine.
22244 // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u]
22245 // In this case the composition of masks yields SplatMask, so it's ok to
22246 // simplify to the splat-shuffle.
22247 // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u]
22248 // In this case the composed mask includes all undef elements of SplatMask
22249 // and in addition sets element zero to undef. It is safe to simplify to
22250 // the splat-shuffle.
22251 auto CanSimplifyToExistingSplat = [](ArrayRef<int> UserMask,
22252 ArrayRef<int> SplatMask) {
// Reject only if a defined user lane lands on an undef splat lane while
// the lane it indexes is itself defined (would expose a new undef).
22253 for (unsigned i = 0, e = UserMask.size(); i != e; ++i)
22254 if (UserMask[i] != -1 && SplatMask[i] == -1 &&
22255 SplatMask[UserMask[i]] != -1)
22259 if (CanSimplifyToExistingSplat(ShufMask, SplatMask))
22260 return Shuf->getOperand(0);
22262 // Create a new shuffle with a mask that is composed of the two shuffles'
// masks: outer undef lanes stay undef, others index through the splat mask.
22264 SmallVector<int, 32> NewMask;
22265 for (int Idx : ShufMask)
22266 NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]);
22268 return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat),
22269 Splat->getOperand(0), Splat->getOperand(1),
22273 // Combine shuffles of bitcasts into a shuffle of the bitcast type, providing
22274 // the mask can be treated as a larger type.
// shuffle (bitcast X), (bitcast Y), Mask --> bitcast (shuffle X, Y, Mask')
// where Mask' is Mask widened by the lane-count ratio; empty on mismatch.
22275 static SDValue combineShuffleOfBitcast(ShuffleVectorSDNode *SVN,
22277 const TargetLowering &TLI,
22278 bool LegalOperations) {
22279 SDValue Op0 = SVN->getOperand(0);
22280 SDValue Op1 = SVN->getOperand(1);
22281 EVT VT = SVN->getValueType(0);
22282 if (Op0.getOpcode() != ISD::BITCAST)
// Op1 must be undef or a bitcast from the same pre-cast type as Op0.
22284 EVT InVT = Op0.getOperand(0).getValueType();
22285 if (!InVT.isVector() ||
22286 (!Op1.isUndef() && (Op1.getOpcode() != ISD::BITCAST ||
22287 Op1.getOperand(0).getValueType() != InVT)))
// Don't fold constant build vectors (other combines handle those).
22289 if (isAnyConstantBuildVector(Op0.getOperand(0)) &&
22290 (Op1.isUndef() || isAnyConstantBuildVector(Op1.getOperand(0))))
// Only narrow-to-wide (VT has more, evenly divisible, lanes than InVT).
22293 int VTLanes = VT.getVectorNumElements();
22294 int InLanes = InVT.getVectorNumElements();
22295 if (VTLanes <= InLanes || VTLanes % InLanes != 0 ||
22296 (LegalOperations &&
22297 !TLI.isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, InVT)))
22299 int Factor = VTLanes / InLanes;
22301 // Check that each group of lanes in the mask are either undef or make a valid
22302 // mask for the wider lane type.
22303 ArrayRef<int> Mask = SVN->getMask();
22304 SmallVector<int> NewMask;
22305 if (!widenShuffleMaskElts(Factor, Mask, NewMask))
22308 if (!TLI.isShuffleMaskLegal(NewMask, InVT))
22311 // Create the new shuffle with the new mask and bitcast it back to the
// original result type.
22314 Op0 = Op0.getOperand(0);
22315 Op1 = Op1.isUndef() ? DAG.getUNDEF(InVT) : Op1.getOperand(0);
22316 SDValue NewShuf = DAG.getVectorShuffle(InVT, DL, Op0, Op1, NewMask);
22317 return DAG.getBitcast(VT, NewShuf);
22320 /// Combine shuffle of shuffle of the form:
22321 /// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
/// Both shuffles must be unary; the composition of the two masks must pick a
/// single source element (a splat). Returns the combined splat shuffle, or
/// empty if the composition is not a splat or the mask is not legal.
22322 static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
22323 SelectionDAG &DAG) {
22324 if (!OuterShuf->getOperand(1).isUndef())
22326 auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
22327 if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
22330 ArrayRef<int> OuterMask = OuterShuf->getMask();
22331 ArrayRef<int> InnerMask = InnerShuf->getMask();
22332 unsigned NumElts = OuterMask.size();
22333 assert(NumElts == InnerMask.size() && "Mask length mismatch");
22334 SmallVector<int, 32> CombinedMask(NumElts, -1);
// SplatIndex records the single source lane every defined output lane must
// resolve to; -1 until the first defined lane is seen.
22335 int SplatIndex = -1;
22336 for (unsigned i = 0; i != NumElts; ++i) {
22337 // Undef lanes remain undef.
22338 int OuterMaskElt = OuterMask[i];
22339 if (OuterMaskElt == -1)
22342 // Peek through the shuffle masks to get the underlying source element.
22343 int InnerMaskElt = InnerMask[OuterMaskElt];
22344 if (InnerMaskElt == -1)
22347 // Initialize the splatted element.
22348 if (SplatIndex == -1)
22349 SplatIndex = InnerMaskElt;
22351 // Non-matching index - this is not a splat.
22352 if (SplatIndex != InnerMaskElt)
22355 CombinedMask[i] = InnerMaskElt;
22357 assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
22358 getSplatIndex(CombinedMask) != -1) &&
22359 "Expected a splat mask");
22361 // TODO: The transform may be a win even if the mask is not legal.
22362 EVT VT = OuterShuf->getValueType(0);
22363 assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
22364 if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
// Shuffle the inner shuffle's sources directly with the composed splat mask.
22367 return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
22368 InnerShuf->getOperand(1), CombinedMask);
22371 /// If the shuffle mask is taking exactly one element from the first vector
22372 /// operand and passing through all other elements from the second vector
22373 /// operand, return the index of the mask element that is choosing an element
22374 /// from the first operand. Otherwise, return -1.
/// Example: mask <4,1,6,7> with MaskSize 4 picks op0 only at position 2
/// (value 6 is 2+MaskSize, i.e. op1 lane 2 in place) — wait, here 6 and 7 are
/// identity op1 lanes and 1 is the lone op0 pick; positions must otherwise be
/// identity picks from op1 (Mask[i] == i + MaskSize).
22375 static int getShuffleMaskIndexOfOneElementFromOp0IntoOp1(ArrayRef<int> Mask) {
22376 int MaskSize = Mask.size();
// Index of the unique op0-sourced lane found so far; -1 while none seen.
22377 int EltFromOp0 = -1;
22378 // TODO: This does not match if there are undef elements in the shuffle mask.
22379 // Should we ignore undefs in the shuffle mask instead? The trade-off is
22380 // removing an instruction (a shuffle), but losing the knowledge that some
22381 // vector lanes are not needed.
22382 for (int i = 0; i != MaskSize; ++i) {
22383 if (Mask[i] >= 0 && Mask[i] < MaskSize) {
22384 // We're looking for a shuffle of exactly one element from operand 0.
// A second op0-sourced lane disqualifies the pattern.
22385 if (EltFromOp0 != -1)
22388 } else if (Mask[i] != i + MaskSize) {
22389 // Nothing from operand 1 can change lanes.
22396 /// If a shuffle inserts exactly one element from a source vector operand into
22397 /// another vector operand and we can access the specified element as a scalar,
22398 /// then we can eliminate the shuffle.
/// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
/// where C' is the shuffle-mask position choosing the op0 element.
22399 static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
22400 SelectionDAG &DAG) {
22401 // First, check if we are taking one element of a vector and shuffling that
22402 // element into another vector.
22403 ArrayRef<int> Mask = Shuf->getMask();
22404 SmallVector<int, 16> CommutedMask(Mask.begin(), Mask.end());
22405 SDValue Op0 = Shuf->getOperand(0);
22406 SDValue Op1 = Shuf->getOperand(1);
22407 int ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(Mask);
22408 if (ShufOp0Index == -1) {
22409 // Commute mask and check again.
22410 ShuffleVectorSDNode::commuteMask(CommutedMask);
22411 ShufOp0Index = getShuffleMaskIndexOfOneElementFromOp0IntoOp1(CommutedMask);
22412 if (ShufOp0Index == -1)
22414 // Commute operands to match the commuted shuffle mask.
22415 std::swap(Op0, Op1);
22416 Mask = CommutedMask;
22419 // The shuffle inserts exactly one element from operand 0 into operand 1.
22420 // Now see if we can access that element as a scalar via a real insert element
22422 // TODO: We can try harder to locate the element as a scalar. Examples: it
22423 // could be an operand of SCALAR_TO_VECTOR, BUILD_VECTOR, or a constant.
22424 assert(Mask[ShufOp0Index] >= 0 && Mask[ShufOp0Index] < (int)Mask.size() &&
22425 "Shuffle mask value must be from operand 0");
22426 if (Op0.getOpcode() != ISD::INSERT_VECTOR_ELT)
// The insert index must be a constant equal to the lane the shuffle reads,
// so the scalar being inserted is exactly the element the shuffle moves.
22429 auto *InsIndexC = dyn_cast<ConstantSDNode>(Op0.getOperand(2));
22430 if (!InsIndexC || InsIndexC->getSExtValue() != Mask[ShufOp0Index])
22433 // There's an existing insertelement with constant insertion index, so we
22434 // don't need to check the legality/profitability of a replacement operation
22435 // that differs at most in the constant value. The target should be able to
22436 // lower any of those in a similar way. If not, legalization will expand this
22437 // to a scalar-to-vector plus shuffle.
22439 // Note that the shuffle may move the scalar from the position that the insert
22440 // element used. Therefore, our new insert element occurs at the shuffle's
22441 // mask index value, not the insert's index value.
22442 // shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
22443 SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
22444 return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
22445 Op1, Op0.getOperand(1), NewInsIndex);
22448 /// If we have a unary shuffle of a shuffle, see if it can be folded away
22449 /// completely. This has the potential to lose undef knowledge because the first
22450 /// shuffle may not have an undef mask element where the second one does. So
22451 /// only call this after doing simplifications based on demanded elements.
/// Returns the inner shuffle when the outer one is a no-op over it; empty
/// otherwise.
22452 static SDValue simplifyShuffleOfShuffle(ShuffleVectorSDNode *Shuf) {
22453 // shuf (shuf0 X, Y, Mask0), undef, Mask
22454 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(Shuf->getOperand(0));
22455 if (!Shuf0 || !Shuf->getOperand(1).isUndef())
22458 ArrayRef<int> Mask = Shuf->getMask();
22459 ArrayRef<int> Mask0 = Shuf0->getMask();
22460 for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
22461 // Ignore undef elements.
22464 assert(Mask[i] >= 0 && Mask[i] < e && "Unexpected shuffle mask value");
22466 // Is the element of the shuffle operand chosen by this shuffle the same as
22467 // the element chosen by the shuffle operand itself?
22468 if (Mask0[Mask[i]] != Mask0[i])
22471 // Every element of this shuffle is identical to the result of the previous
22472 // shuffle, so we can replace this value.
22473 return Shuf->getOperand(0);
// Combine an ISD::VECTOR_SHUFFLE node. This is a long chain of independent
// folds -- canonicalizations (undef operands, duplicated operands, splats),
// conversions to BUILD_VECTOR / AND / INSERT_SUBVECTOR, and merging of
// shuffle-of-shuffle chains (directly and through bitcasts or binops). Each
// fold returns early when it fires; falling off the end returns the (elided)
// default.
// NOTE(review): the secondary line numbering in this excerpt skips repeatedly,
// so early-return statements, guard conditions and closing braces are missing
// from view; comments below only describe what the visible lines establish.
22476 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
22477 EVT VT = N->getValueType(0);
22478 unsigned NumElts = VT.getVectorNumElements();
22480 SDValue N0 = N->getOperand(0);
22481 SDValue N1 = N->getOperand(1);
22483 assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
22485 // Canonicalize shuffle undef, undef -> undef
22486 if (N0.isUndef() && N1.isUndef())
22487 return DAG.getUNDEF(VT);
22489 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
22491 // Canonicalize shuffle v, v -> v, undef
// (the N0 == N1 guard for this fold is elided in this excerpt)
22493 return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT),
22494 createUnaryMask(SVN->getMask(), NumElts));
22496 // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
22498 return DAG.getCommutedVectorShuffle(*SVN);
22500 // Remove references to rhs if it is undef
// Rebuild the mask so no element points into the (undef) RHS; only emit a new
// shuffle if something actually changed (Changed flag; its use is elided).
22501 if (N1.isUndef()) {
22502 bool Changed = false;
22503 SmallVector<int, 8> NewMask;
22504 for (unsigned i = 0; i != NumElts; ++i) {
22505 int Idx = SVN->getMaskElt(i);
22506 if (Idx >= (int)NumElts) {
22510 NewMask.push_back(Idx);
22513 return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
22516 if (SDValue InsElt = replaceShuffleOfInsert(SVN, DAG))
22519 // A shuffle of a single vector that is a splatted value can always be folded.
22520 if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
22523 if (SDValue V = formSplatFromShuffles(SVN, DAG))
22526 // If it is a splat, check if the argument vector is another splat or a
22528 if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
22529 int SplatIndex = SVN->getSplatIndex();
// Scalarize a splat of a one-use binop: extract the splatted lane from both
// binop operands, redo the op on scalars, then splat the scalar result.
22530 if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
22531 TLI.isBinOp(N0.getOpcode()) && N0->getNumValues() == 1) {
22532 // splat (vector_bo L, R), Index -->
22533 // splat (scalar_bo (extelt L, Index), (extelt R, Index))
22534 SDValue L = N0.getOperand(0), R = N0.getOperand(1);
22536 EVT EltVT = VT.getScalarType();
22537 SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
22538 SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
22539 SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
22541 DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR, N0->getFlags());
22542 SDValue Insert = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, NewBO);
22543 SmallVector<int, 16> ZeroMask(VT.getVectorNumElements(), 0);
22544 return DAG.getVectorShuffle(VT, DL, Insert, DAG.getUNDEF(VT), ZeroMask);
22547 // splat(scalar_to_vector(x), 0) -> build_vector(x,...,x)
22548 // splat(insert_vector_elt(v, x, c), c) -> build_vector(x,...,x)
22549 if ((!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT)) &&
22551 if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR && SplatIndex == 0)
22552 return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(0));
22554 if (N0.getOpcode() == ISD::INSERT_VECTOR_ELT)
22555 if (auto *Idx = dyn_cast<ConstantSDNode>(N0.getOperand(2)))
22556 if (Idx->getAPIntValue() == SplatIndex)
22557 return DAG.getSplatBuildVector(VT, SDLoc(N), N0.getOperand(1));
22560 // If this is a bit convert that changes the element type of the vector but
22561 // not the number of vector elements, look through it. Be careful not to
22562 // look though conversions that change things like v4f32 to v2f64.
22563 SDNode *V = N0.getNode();
22564 if (V->getOpcode() == ISD::BITCAST) {
22565 SDValue ConvInput = V->getOperand(0);
22566 if (ConvInput.getValueType().isVector() &&
22567 ConvInput.getValueType().getVectorNumElements() == NumElts)
22568 V = ConvInput.getNode();
22571 if (V->getOpcode() == ISD::BUILD_VECTOR) {
22572 assert(V->getNumOperands() == NumElts &&
22573 "BUILD_VECTOR has wrong number of operands");
// First pass: find the first non-undef element (Base); second pass: check
// whether every other element matches it (AllSame bookkeeping; parts elided).
22575 bool AllSame = true;
22576 for (unsigned i = 0; i != NumElts; ++i) {
22577 if (!V->getOperand(i).isUndef()) {
22578 Base = V->getOperand(i);
22582 // Splat of <u, u, u, u>, return <u, u, u, u>
22583 if (!Base.getNode())
22585 for (unsigned i = 0; i != NumElts; ++i) {
22586 if (V->getOperand(i) != Base) {
22591 // Splat of <x, x, x, x>, return <x, x, x, x>
22595 // Canonicalize any other splat as a build_vector.
22596 SDValue Splatted = V->getOperand(SplatIndex);
22597 SmallVector<SDValue, 8> Ops(NumElts, Splatted);
22598 SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
22600 // We may have jumped through bitcasts, so the type of the
22601 // BUILD_VECTOR may not match the type of the shuffle.
22602 if (V->getValueType(0) != VT)
22603 NewBV = DAG.getBitcast(VT, NewBV);
22608 // Simplify source operands based on shuffle mask.
22609 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
22610 return SDValue(N, 0);
22612 // This is intentionally placed after demanded elements simplification because
22613 // it could eliminate knowledge of undef elements created by this shuffle.
22614 if (SDValue ShufOp = simplifyShuffleOfShuffle(SVN))
22617 // Match shuffles that can be converted to any_vector_extend_in_reg.
22618 if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
22621 // Combine "truncate_vector_in_reg" style shuffles.
22622 if (SDValue V = combineTruncationShuffle(SVN, DAG))
22625 if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
22626 Level < AfterLegalizeVectorOps &&
22628 (N1.getOpcode() == ISD::CONCAT_VECTORS &&
22629 N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
22630 if (SDValue V = partitionShuffleOfConcats(N, DAG))
22634 // A shuffle of a concat of the same narrow vector can be reduced to use
22635 // only low-half elements of a concat with undef:
22636 // shuf (concat X, X), undef, Mask --> shuf (concat X, undef), undef, Mask'
22637 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N1.isUndef() &&
22638 N0.getNumOperands() == 2 &&
22639 N0.getOperand(0) == N0.getOperand(1)) {
22640 int HalfNumElts = (int)NumElts / 2;
22641 SmallVector<int, 8> NewMask;
22642 for (unsigned i = 0; i != NumElts; ++i) {
22643 int Idx = SVN->getMaskElt(i);
// References into the duplicated high half are remapped to the low half.
22644 if (Idx >= HalfNumElts) {
22645 assert(Idx < (int)NumElts && "Shuffle mask chooses undef op");
22646 Idx -= HalfNumElts;
22648 NewMask.push_back(Idx);
22650 if (TLI.isShuffleMaskLegal(NewMask, VT)) {
22651 SDValue UndefVec = DAG.getUNDEF(N0.getOperand(0).getValueType());
22652 SDValue NewCat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
22653 N0.getOperand(0), UndefVec);
22654 return DAG.getVectorShuffle(VT, SDLoc(N), NewCat, N1, NewMask);
22658 // See if we can replace a shuffle with an insert_subvector.
22659 // e.g. v2i32 into v8i32:
22660 // shuffle(lhs,concat(rhs0,rhs1,rhs2,rhs3),0,1,2,3,10,11,6,7).
22661 // --> insert_subvector(lhs,rhs1,4).
22662 if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT) &&
22663 TLI.isOperationLegalOrCustom(ISD::INSERT_SUBVECTOR, VT)) {
// Helper: try to match Mask as "identity over LHS with one contiguous span
// replaced by a subvector of the RHS concat"; on success build the
// INSERT_SUBVECTOR, otherwise fall through (failure returns elided).
22664 auto ShuffleToInsert = [&](SDValue LHS, SDValue RHS, ArrayRef<int> Mask) {
22665 // Ensure RHS subvectors are legal.
22666 assert(RHS.getOpcode() == ISD::CONCAT_VECTORS && "Can't find subvectors");
22667 EVT SubVT = RHS.getOperand(0).getValueType();
22668 int NumSubVecs = RHS.getNumOperands();
22669 int NumSubElts = SubVT.getVectorNumElements();
22670 assert((NumElts % NumSubElts) == 0 && "Subvector mismatch");
22671 if (!TLI.isTypeLegal(SubVT))
22674 // Don't bother if we have an unary shuffle (matches undef + LHS elts).
22675 if (all_of(Mask, [NumElts](int M) { return M < (int)NumElts; }))
22678 // Search [NumSubElts] spans for RHS sequence.
22679 // TODO: Can we avoid nested loops to increase performance?
22680 SmallVector<int> InsertionMask(NumElts);
22681 for (int SubVec = 0; SubVec != NumSubVecs; ++SubVec) {
22682 for (int SubIdx = 0; SubIdx != (int)NumElts; SubIdx += NumSubElts) {
22683 // Reset mask to identity.
22684 std::iota(InsertionMask.begin(), InsertionMask.end(), 0);
22686 // Add subvector insertion.
22687 std::iota(InsertionMask.begin() + SubIdx,
22688 InsertionMask.begin() + SubIdx + NumSubElts,
22689 NumElts + (SubVec * NumSubElts));
22691 // See if the shuffle mask matches the reference insertion mask.
22692 bool MatchingShuffle = true;
22693 for (int i = 0; i != (int)NumElts; ++i) {
22694 int ExpectIdx = InsertionMask[i];
22695 int ActualIdx = Mask[i];
// Undef mask elements (ActualIdx < 0) match anything.
22696 if (0 <= ActualIdx && ExpectIdx != ActualIdx) {
22697 MatchingShuffle = false;
22702 if (MatchingShuffle)
22703 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, LHS,
22704 RHS.getOperand(SubVec),
22705 DAG.getVectorIdxConstant(SubIdx, SDLoc(N)));
// Try the helper with RHS = N1 directly, then with operands commuted so a
// concat on the LHS can also be matched.
22710 ArrayRef<int> Mask = SVN->getMask();
22711 if (N1.getOpcode() == ISD::CONCAT_VECTORS)
22712 if (SDValue InsertN1 = ShuffleToInsert(N0, N1, Mask))
22714 if (N0.getOpcode() == ISD::CONCAT_VECTORS) {
22715 SmallVector<int> CommuteMask(Mask.begin(), Mask.end());
22716 ShuffleVectorSDNode::commuteMask(CommuteMask);
22717 if (SDValue InsertN0 = ShuffleToInsert(N1, N0, CommuteMask))
22722 // If we're not performing a select/blend shuffle, see if we can convert the
22723 // shuffle into a AND node, with all the out-of-lane elements are known zero.
22724 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
22725 bool IsInLaneMask = true;
22726 ArrayRef<int> Mask = SVN->getMask();
22727 SmallVector<int, 16> ClearMask(NumElts, -1);
22728 APInt DemandedLHS = APInt::getNullValue(NumElts);
22729 APInt DemandedRHS = APInt::getNullValue(NumElts);
22730 for (int I = 0; I != (int)NumElts; ++I) {
// (declaration/guard for per-lane mask value M is elided here)
22734 ClearMask[I] = M == I ? I : (I + NumElts);
22735 IsInLaneMask &= (M == I) || (M == (int)(I + NumElts));
22737 APInt &Demanded = M < (int)NumElts ? DemandedLHS : DemandedRHS;
22738 Demanded.setBit(M % NumElts);
22741 // TODO: Should we try to mask with N1 as well?
22742 if (!IsInLaneMask &&
22743 (!DemandedLHS.isNullValue() || !DemandedRHS.isNullValue()) &&
22744 (DemandedLHS.isNullValue() ||
22745 DAG.MaskedVectorIsZero(N0, DemandedLHS)) &&
22746 (DemandedRHS.isNullValue() ||
22747 DAG.MaskedVectorIsZero(N1, DemandedRHS))) {
22749 EVT IntVT = VT.changeVectorElementTypeToInteger();
22750 EVT IntSVT = VT.getVectorElementType().changeTypeToInteger();
22751 SDValue ZeroElt = DAG.getConstant(0, DL, IntSVT);
22752 SDValue AllOnesElt = DAG.getAllOnesConstant(DL, IntSVT);
22753 SmallVector<SDValue, 16> AndMask(NumElts, DAG.getUNDEF(IntSVT));
22754 for (int I = 0; I != (int)NumElts; ++I)
22756 AndMask[I] = Mask[I] == I ? AllOnesElt : ZeroElt;
22758 // See if a clear mask is legal instead of going via
22759 // XformToShuffleWithZero which loses UNDEF mask elements.
22760 if (TLI.isVectorClearMaskLegal(ClearMask, IntVT))
22761 return DAG.getBitcast(
22762 VT, DAG.getVectorShuffle(IntVT, DL, DAG.getBitcast(IntVT, N0),
22763 DAG.getConstant(0, DL, IntVT), ClearMask));
22765 if (TLI.isOperationLegalOrCustom(ISD::AND, IntVT))
22766 return DAG.getBitcast(
22767 VT, DAG.getNode(ISD::AND, DL, IntVT, DAG.getBitcast(IntVT, N0),
22768 DAG.getBuildVector(IntVT, DL, AndMask)));
22772 // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
22773 // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
22774 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT))
22775 if (SDValue Res = combineShuffleOfScalars(SVN, DAG, TLI))
22778 // If this shuffle only has a single input that is a bitcasted shuffle,
22779 // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
22780 // back to their original types.
22781 if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
22782 N1.isUndef() && Level < AfterLegalizeVectorOps &&
22783 TLI.isTypeLegal(VT)) {
22785 SDValue BC0 = peekThroughOneUseBitcasts(N0);
22786 if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
22787 EVT SVT = VT.getScalarType();
22788 EVT InnerVT = BC0->getValueType(0);
22789 EVT InnerSVT = InnerVT.getScalarType();
22791 // Determine which shuffle works with the smaller scalar type.
22792 EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
22793 EVT ScaleSVT = ScaleVT.getScalarType();
// Both element sizes must be exact multiples of the common (smaller) scalar
// size so both masks can be re-expressed at that granularity.
22795 if (TLI.isTypeLegal(ScaleVT) &&
22796 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
22797 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
22798 int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
22799 int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
22801 // Scale the shuffle masks to the smaller scalar type.
22802 ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
22803 SmallVector<int, 8> InnerMask;
22804 SmallVector<int, 8> OuterMask;
22805 narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
22806 narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);
22808 // Merge the shuffle masks.
22809 SmallVector<int, 8> NewMask;
22810 for (int M : OuterMask)
22811 NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
22813 // Test for shuffle mask legality over both commutations.
22814 SDValue SV0 = BC0->getOperand(0);
22815 SDValue SV1 = BC0->getOperand(1);
22816 bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
// (the !LegalMask guard for the retry below is elided in this excerpt)
22818 std::swap(SV0, SV1);
22819 ShuffleVectorSDNode::commuteMask(NewMask);
22820 LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
22824 SV0 = DAG.getBitcast(ScaleVT, SV0);
22825 SV1 = DAG.getBitcast(ScaleVT, SV1);
22826 return DAG.getBitcast(
22827 VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
22833 // Match shuffles of bitcasts, so long as the mask can be treated as the
22835 if (SDValue V = combineShuffleOfBitcast(SVN, DAG, TLI, LegalOperations))
22838 // Compute the combined shuffle mask for a shuffle with SV0 as the first
22839 // operand, and SV1 as the second operand.
22840 // i.e. Merge SVN(OtherSVN, N1) -> shuffle(SV0, SV1, Mask) iff Commute = false
22841 // Merge SVN(N1, OtherSVN) -> shuffle(SV0, SV1, Mask') iff Commute = true
// Lambda used by the two merging folds below. On success it fills SV0/SV1/Mask
// and returns true; failure paths (elided here) leave the caller to try the
// next combination.
22842 auto MergeInnerShuffle =
22843 [NumElts, &VT](bool Commute, ShuffleVectorSDNode *SVN,
22844 ShuffleVectorSDNode *OtherSVN, SDValue N1,
22845 const TargetLowering &TLI, SDValue &SV0, SDValue &SV1,
22846 SmallVectorImpl<int> &Mask) -> bool {
22847 // Don't try to fold splats; they're likely to simplify somehow, or they
22849 if (OtherSVN->isSplat())
22852 SV0 = SV1 = SDValue();
22855 for (unsigned i = 0; i != NumElts; ++i) {
22856 int Idx = SVN->getMaskElt(i);
22858 // Propagate Undef.
22859 Mask.push_back(Idx);
// When Commute is set, flip which half of the index space refers to the
// inner shuffle vs. N1.
22864 Idx = (Idx < (int)NumElts) ? (Idx + NumElts) : (Idx - NumElts);
22866 SDValue CurrentVec;
22867 if (Idx < (int)NumElts) {
22868 // This shuffle index refers to the inner shuffle N0. Lookup the inner
22869 // shuffle mask to identify which vector is actually referenced.
22870 Idx = OtherSVN->getMaskElt(Idx);
22872 // Propagate Undef.
22873 Mask.push_back(Idx);
22876 CurrentVec = (Idx < (int)NumElts) ? OtherSVN->getOperand(0)
22877 : OtherSVN->getOperand(1);
22879 // This shuffle index references an element within N1.
22883 // Simple case where 'CurrentVec' is UNDEF.
22884 if (CurrentVec.isUndef()) {
22885 Mask.push_back(-1);
22889 // Canonicalize the shuffle index. We don't know yet if CurrentVec
22890 // will be the first or second operand of the combined shuffle.
22891 Idx = Idx % NumElts;
22892 if (!SV0.getNode() || SV0 == CurrentVec) {
22893 // Ok. CurrentVec is the left hand side.
22894 // Update the mask accordingly.
22896 Mask.push_back(Idx);
22899 if (!SV1.getNode() || SV1 == CurrentVec) {
22900 // Ok. CurrentVec is the right hand side.
22901 // Update the mask accordingly.
22903 Mask.push_back(Idx + NumElts);
22907 // Last chance - see if the vector is another shuffle and if it
22908 // uses one of the existing candidate shuffle ops.
22909 if (auto *CurrentSVN = dyn_cast<ShuffleVectorSDNode>(CurrentVec)) {
22910 int InnerIdx = CurrentSVN->getMaskElt(Idx);
22911 if (InnerIdx < 0) {
22912 Mask.push_back(-1);
22915 SDValue InnerVec = (InnerIdx < (int)NumElts)
22916 ? CurrentSVN->getOperand(0)
22917 : CurrentSVN->getOperand(1);
22918 if (InnerVec.isUndef()) {
22919 Mask.push_back(-1);
22922 InnerIdx %= NumElts;
22923 if (InnerVec == SV0) {
22924 Mask.push_back(InnerIdx);
22927 if (InnerVec == SV1) {
22928 Mask.push_back(InnerIdx + NumElts);
22933 // Bail out if we cannot convert the shuffle pair into a single shuffle.
// All-undef merged mask: caller treats this as a trivial success (the exact
// return here is elided in this excerpt).
22937 if (llvm::all_of(Mask, [](int M) { return M < 0; }))
22940 // Avoid introducing shuffles with illegal mask.
22941 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
22942 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
22943 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
22944 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
22945 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
22946 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
22947 if (TLI.isShuffleMaskLegal(Mask, VT))
22950 std::swap(SV0, SV1);
22951 ShuffleVectorSDNode::commuteMask(Mask);
22952 return TLI.isShuffleMaskLegal(Mask, VT);
22955 if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
22956 // Canonicalize shuffles according to rules:
22957 // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
22958 // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
22959 // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
22960 if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
22961 N0.getOpcode() != ISD::VECTOR_SHUFFLE) {
22962 // The incoming shuffle must be of the same type as the result of the
22963 // current shuffle.
22964 assert(N1->getOperand(0).getValueType() == VT &&
22965 "Shuffle types don't match");
22967 SDValue SV0 = N1->getOperand(0);
22968 SDValue SV1 = N1->getOperand(1);
22969 bool HasSameOp0 = N0 == SV0;
22970 bool IsSV1Undef = SV1.isUndef();
22971 if (HasSameOp0 || IsSV1Undef || N0 == SV1)
22972 // Commute the operands of this shuffle so merging below will trigger.
22973 return DAG.getCommutedVectorShuffle(*SVN);
22976 // Canonicalize splat shuffles to the RHS to improve merging below.
22977 // shuffle(splat(A,u), shuffle(C,D)) -> shuffle'(shuffle(C,D), splat(A,u))
22978 if (N0.getOpcode() == ISD::VECTOR_SHUFFLE &&
22979 N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
22980 cast<ShuffleVectorSDNode>(N0)->isSplat() &&
22981 !cast<ShuffleVectorSDNode>(N1)->isSplat()) {
22982 return DAG.getCommutedVectorShuffle(*SVN);
22985 // Try to fold according to rules:
22986 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
22987 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
22988 // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
22989 // Don't try to fold shuffles with illegal type.
22990 // Only fold if this shuffle is the only user of the other shuffle.
22991 // Try matching shuffle(C,shuffle(A,B)) commutted patterns as well.
22992 for (int i = 0; i != 2; ++i) {
22993 if (N->getOperand(i).getOpcode() == ISD::VECTOR_SHUFFLE &&
22994 N->isOnlyUserOf(N->getOperand(i).getNode())) {
22995 // The incoming shuffle must be of the same type as the result of the
22996 // current shuffle.
22997 auto *OtherSV = cast<ShuffleVectorSDNode>(N->getOperand(i));
22998 assert(OtherSV->getOperand(0).getValueType() == VT &&
22999 "Shuffle types don't match");
// (declarations of SV0/SV1 outputs are elided here)
23002 SmallVector<int, 4> Mask;
23003 if (MergeInnerShuffle(i != 0, SVN, OtherSV, N->getOperand(1 - i), TLI,
23005 // Check if all indices in Mask are Undef. In case, propagate Undef.
23006 if (llvm::all_of(Mask, [](int M) { return M < 0; }))
23007 return DAG.getUNDEF(VT);
23009 return DAG.getVectorShuffle(VT, SDLoc(N),
23010 SV0 ? SV0 : DAG.getUNDEF(VT),
23011 SV1 ? SV1 : DAG.getUNDEF(VT), Mask);
23016 // Merge shuffles through binops if we are able to merge it with at least
23017 // one other shuffles.
23018 // shuffle(bop(shuffle(x,y),shuffle(z,w)),undef)
23019 // shuffle(bop(shuffle(x,y),shuffle(z,w)),bop(shuffle(a,b),shuffle(c,d)))
23020 unsigned SrcOpcode = N0.getOpcode();
23021 if (TLI.isBinOp(SrcOpcode) && N->isOnlyUserOf(N0.getNode()) &&
23023 (SrcOpcode == N1.getOpcode() && N->isOnlyUserOf(N1.getNode())))) {
23024 // Get binop source ops, or just pass on the undef.
23025 SDValue Op00 = N0.getOperand(0);
23026 SDValue Op01 = N0.getOperand(1);
23027 SDValue Op10 = N1.isUndef() ? N1 : N1.getOperand(0);
23028 SDValue Op11 = N1.isUndef() ? N1 : N1.getOperand(1);
23029 // TODO: We might be able to relax the VT check but we don't currently
23030 // have any isBinOp() that has different result/ops VTs so play safe until
23031 // we have test coverage.
23032 if (Op00.getValueType() == VT && Op10.getValueType() == VT &&
23033 Op01.getValueType() == VT && Op11.getValueType() == VT &&
23034 (Op00.getOpcode() == ISD::VECTOR_SHUFFLE ||
23035 Op10.getOpcode() == ISD::VECTOR_SHUFFLE ||
23036 Op01.getOpcode() == ISD::VECTOR_SHUFFLE ||
23037 Op11.getOpcode() == ISD::VECTOR_SHUFFLE)) {
23038 auto CanMergeInnerShuffle = [&](SDValue &SV0, SDValue &SV1,
23039 SmallVectorImpl<int> &Mask, bool LeftOp,
23041 SDValue InnerN = Commute ? N1 : N0;
23042 SDValue Op0 = LeftOp ? Op00 : Op01;
23043 SDValue Op1 = LeftOp ? Op10 : Op11;
23045 std::swap(Op0, Op1);
23046 // Only accept the merged shuffle if we don't introduce undef elements,
23047 // or the inner shuffle already contained undef elements.
23048 auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(Op0);
23049 return SVN0 && InnerN->isOnlyUserOf(SVN0) &&
23050 MergeInnerShuffle(Commute, SVN, SVN0, Op1, TLI, SV0, SV1,
23052 (llvm::any_of(SVN0->getMask(), [](int M) { return M < 0; }) ||
23053 llvm::none_of(Mask, [](int M) { return M < 0; }));
23056 // Ensure we don't increase the number of shuffles - we must merge a
23057 // shuffle from at least one of the LHS and RHS ops.
23058 bool MergedLeft = false;
23059 SDValue LeftSV0, LeftSV1;
23060 SmallVector<int, 4> LeftMask;
23061 if (CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, false) ||
23062 CanMergeInnerShuffle(LeftSV0, LeftSV1, LeftMask, true, true)) {
// (MergedLeft = true; branch bookkeeping partially elided) -- on failure the
// left side keeps the original outer mask and operands.
23065 LeftMask.assign(SVN->getMask().begin(), SVN->getMask().end());
23066 LeftSV0 = Op00, LeftSV1 = Op10;
23069 bool MergedRight = false;
23070 SDValue RightSV0, RightSV1;
23071 SmallVector<int, 4> RightMask;
23072 if (CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, false) ||
23073 CanMergeInnerShuffle(RightSV0, RightSV1, RightMask, false, true)) {
23074 MergedRight = true;
23076 RightMask.assign(SVN->getMask().begin(), SVN->getMask().end());
23077 RightSV0 = Op01, RightSV1 = Op11;
23080 if (MergedLeft || MergedRight) {
23082 SDValue LHS = DAG.getVectorShuffle(
23083 VT, DL, LeftSV0 ? LeftSV0 : DAG.getUNDEF(VT),
23084 LeftSV1 ? LeftSV1 : DAG.getUNDEF(VT), LeftMask);
23085 SDValue RHS = DAG.getVectorShuffle(
23086 VT, DL, RightSV0 ? RightSV0 : DAG.getUNDEF(VT),
23087 RightSV1 ? RightSV1 : DAG.getUNDEF(VT), RightMask);
23088 return DAG.getNode(SrcOpcode, DL, VT, LHS, RHS);
23094 if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
// Combine an ISD::SCALAR_TO_VECTOR node. The visible fold rewrites
// scalar_to_vector(extract_vector_elt(V, C0)) as a legal VECTOR_SHUFFLE of V
// (possibly followed by an EXTRACT_SUBVECTOR truncate), handling an implicit
// integer truncate first.
// NOTE(review): several interior lines are elided in this excerpt (e.g. the
// statement populating NewMask[0] with Elt, the 'Val =' declaration at 23122,
// and the function's trailing return) -- confirm against the full file.
23100 SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
23101 SDValue InVal = N->getOperand(0);
23102 EVT VT = N->getValueType(0);
23104 // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
23105 // with a VECTOR_SHUFFLE and possible truncate.
// Only fixed-length vectors: the shuffle-mask rewrite below indexes lanes.
23106 if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
23107 VT.isFixedLengthVector() &&
23108 InVal->getOperand(0).getValueType().isFixedLengthVector()) {
23109 SDValue InVec = InVal->getOperand(0);
23110 SDValue EltNo = InVal->getOperand(1);
23111 auto InVecT = InVec.getValueType();
// The extract index must be a build-time constant to form the mask.
23112 if (ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo)) {
23113 SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
23114 int Elt = C0->getZExtValue();
23116 // If we have an implict truncate do truncate here as long as it's legal.
23117 // if it's not legal, this should
23118 if (VT.getScalarType() != InVal.getValueType() &&
23119 InVal.getValueType().isScalarInteger() &&
23120 isTypeLegal(VT.getScalarType())) {
23122 DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
23123 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
23125 if (VT.getScalarType() == InVecT.getScalarType() &&
23126 VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
23127 SDValue LegalShuffle =
23128 TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
23129 DAG.getUNDEF(InVecT), NewMask, DAG);
23130 if (LegalShuffle) {
23131 // If the initial vector is the correct size this shuffle is a
23134 return LegalShuffle;
23135 // If not we must truncate the vector.
23136 if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
23137 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
23138 EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
23139 InVecT.getVectorElementType(),
23140 VT.getVectorNumElements());
23141 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
23142 LegalShuffle, ZeroIdx);
// Combine an ISD::INSERT_SUBVECTOR node (insert N1 into N0 at constant lane
// index N2). A sequence of independent folds: undef-insert elimination,
// bitcast hoisting, merging/reordering of chained inserts, rewriting an insert
// into a concat, and demanded-elements simplification.
// NOTE(review): secondary numbering skips show elided lines (e.g. the
// N1.isUndef() body at 23159-23162, the second operand of the fold at
// 23212-23214, and the trailing return) -- confirm against the full file.
23152 SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
23153 EVT VT = N->getValueType(0);
23154 SDValue N0 = N->getOperand(0);
23155 SDValue N1 = N->getOperand(1);
23156 SDValue N2 = N->getOperand(2);
23157 uint64_t InsIdx = N->getConstantOperandVal(2);
23159 // If inserting an UNDEF, just return the original vector.
23163 // If this is an insert of an extracted vector into an undef vector, we can
23164 // just use the input to the extract.
23165 if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23166 N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
23167 return N1.getOperand(0);
23169 // Simplify scalar inserts into an undef vector:
23170 // insert_subvector undef, (splat X), N2 -> splat X
23171 if (N0.isUndef() && N1.getOpcode() == ISD::SPLAT_VECTOR)
23172 return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, N1.getOperand(0))
23174 // If we are inserting a bitcast value into an undef, with the same
23175 // number of elements, just use the bitcast input of the extract.
23176 // i.e. INSERT_SUBVECTOR UNDEF (BITCAST N1) N2 ->
23177 // BITCAST (INSERT_SUBVECTOR UNDEF N1 N2)
23178 if (N0.isUndef() && N1.getOpcode() == ISD::BITCAST &&
23179 N1.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
23180 N1.getOperand(0).getOperand(1) == N2 &&
23181 N1.getOperand(0).getOperand(0).getValueType().getVectorElementCount() ==
23182 VT.getVectorElementCount() &&
23183 N1.getOperand(0).getOperand(0).getValueType().getSizeInBits() ==
23184 VT.getSizeInBits()) {
23185 return DAG.getBitcast(VT, N1.getOperand(0).getOperand(0));
23188 // If both N1 and N2 are bitcast values on which insert_subvector
23189 // would makes sense, pull the bitcast through.
23190 // i.e. INSERT_SUBVECTOR (BITCAST N0) (BITCAST N1) N2 ->
23191 // BITCAST (INSERT_SUBVECTOR N0 N1 N2)
23192 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) {
23193 SDValue CN0 = N0.getOperand(0);
23194 SDValue CN1 = N1.getOperand(0);
23195 EVT CN0VT = CN0.getValueType();
23196 EVT CN1VT = CN1.getValueType();
// Both pre-bitcast types must be vectors with matching element type and the
// outer element count so the insert stays well-formed.
23197 if (CN0VT.isVector() && CN1VT.isVector() &&
23198 CN0VT.getVectorElementType() == CN1VT.getVectorElementType() &&
23199 CN0VT.getVectorElementCount() == VT.getVectorElementCount()) {
23200 SDValue NewINSERT = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N),
23201 CN0.getValueType(), CN0, CN1, N2);
23202 return DAG.getBitcast(VT, NewINSERT);
23206 // Combine INSERT_SUBVECTORs where we are inserting to the same index.
23207 // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
23208 // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
23209 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
23210 N0.getOperand(1).getValueType() == N1.getValueType() &&
23211 N0.getOperand(2) == N2)
23212 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
23215 // Eliminate an intermediate insert into an undef vector:
23216 // insert_subvector undef, (insert_subvector undef, X, 0), N2 -->
23217 // insert_subvector undef, X, N2
23218 if (N0.isUndef() && N1.getOpcode() == ISD::INSERT_SUBVECTOR &&
23219 N1.getOperand(0).isUndef() && isNullConstant(N1.getOperand(2)))
23220 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
23221 N1.getOperand(1), N2);
23223 // Push subvector bitcasts to the output, adjusting the index as we go.
23224 // insert_subvector(bitcast(v), bitcast(s), c1)
23225 // -> bitcast(insert_subvector(v, s, c2))
23226 if ((N0.isUndef() || N0.getOpcode() == ISD::BITCAST) &&
23227 N1.getOpcode() == ISD::BITCAST) {
23228 SDValue N0Src = peekThroughBitcasts(N0);
23229 SDValue N1Src = peekThroughBitcasts(N1);
23230 EVT N0SrcSVT = N0Src.getValueType().getScalarType();
23231 EVT N1SrcSVT = N1Src.getValueType().getScalarType();
23232 if ((N0.isUndef() || N0SrcSVT == N1SrcSVT) &&
23233 N0Src.getValueType().isVector() && N1Src.getValueType().isVector()) {
// (declarations of NewVT / NewIdx / DL are elided in this excerpt)
23237 LLVMContext &Ctx = *DAG.getContext();
23238 ElementCount NumElts = VT.getVectorElementCount();
23239 unsigned EltSizeInBits = VT.getScalarSizeInBits();
// Rescale the insert index: narrower source scalars multiply the lane count
// and index; wider source scalars divide them (only when evenly divisible).
23240 if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
23241 unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
23242 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
23243 NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
23244 } else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
23245 unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
23246 if (NumElts.isKnownMultipleOf(Scale) && (InsIdx % Scale) == 0) {
23247 NewVT = EVT::getVectorVT(Ctx, N1SrcSVT,
23248 NumElts.divideCoefficientBy(Scale));
23249 NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
23252 if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
23253 SDValue Res = DAG.getBitcast(NewVT, N0Src);
23254 Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NewVT, Res, N1Src, NewIdx);
23255 return DAG.getBitcast(VT, Res);
23260 // Canonicalize insert_subvector dag nodes.
23262 // (insert_subvector (insert_subvector A, Idx0), Idx1)
23263 // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
// Sort chained inserts by ascending index; requires a one-use inner insert of
// the same subvector type.
23264 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
23265 N1.getValueType() == N0.getOperand(1).getValueType()) {
23266 unsigned OtherIdx = N0.getConstantOperandVal(2);
23267 if (InsIdx < OtherIdx) {
23269 SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
23270 N0.getOperand(0), N1, N2);
23271 AddToWorklist(NewOp.getNode());
23272 return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
23273 VT, NewOp, N0.getOperand(1), N0.getOperand(2));
23277 // If the input vector is a concatenation, and the insert replaces
23278 // one of the pieces, we can optimize into a single concat_vectors.
23279 if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
23280 N0.getOperand(0).getValueType() == N1.getValueType() &&
23281 N0.getOperand(0).getValueType().isScalableVector() ==
23282 N1.getValueType().isScalableVector()) {
23283 unsigned Factor = N1.getValueType().getVectorMinNumElements();
23284 SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
23285 Ops[InsIdx / Factor] = N1;
23286 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
23289 // Simplify source operands based on insertion.
23290 if (SimplifyDemandedVectorElts(SDValue(N, 0)))
23291 return SDValue(N, 0);
// Combine an ISD::FP_TO_FP16 node: a round-trip through FP16_TO_FP cancels
// out, yielding the original half-as-i16 value.
// NOTE(review): the fallthrough (presumably 'return SDValue();') and closing
// brace are elided in this excerpt.
23296 SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
23297 SDValue N0 = N->getOperand(0);
23299 // fold (fp_to_fp16 (fp16_to_fp op)) -> op
23300 if (N0->getOpcode() == ISD::FP16_TO_FP)
23301 return N0->getOperand(0);
// Combine an ISD::FP16_TO_FP node: the conversion only reads the low 16 bits
// of its input, so a masking 'and op, 0xffff' is redundant and can be dropped
// -- unless the target asks to keep the zero-extension
// (TLI.shouldKeepZExtForFP16Conv()).
// NOTE(review): the rebuilt node's operand at 23313 (presumably
// N0.getOperand(0)), the closing braces and the trailing return are elided in
// this excerpt.
23306 SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
23307 SDValue N0 = N->getOperand(0);
23309 // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
23310 if (!TLI.shouldKeepZExtForFP16Conv() && N0->getOpcode() == ISD::AND) {
23311 ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
23312 if (AndConst && AndConst->getAPIntValue() == 0xffff) {
23313 return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
// Combine an ISD::FP_TO_BF16 node: a round-trip through BF16_TO_FP cancels
// out, mirroring the FP16 fold above.
// NOTE(review): the fallthrough (presumably 'return SDValue();') and closing
// brace are elided in this excerpt.
23321 SDValue DAGCombiner::visitFP_TO_BF16(SDNode *N) {
23322 SDValue N0 = N->getOperand(0);
23324 // fold (fp_to_bf16 (bf16_to_fp op)) -> op
23325 if (N0->getOpcode() == ISD::BF16_TO_FP)
23326 return N0->getOperand(0);
// Combine an ISD::VECREDUCE_* node: scalarize single-element reductions,
// rewrite boolean AND/OR reductions as UMIN/UMAX when only the latter are
// legal, and peel an insert_subvector whose surrounding lanes are the
// reduction's identity (zero for OR, all-ones for AND).
// NOTE(review): a few interior lines are elided in this excerpt (the SDLoc/Res
// declarations around 23338-23339, the 'return Res;' near 23344, and the
// trailing return) -- confirm against the full file.
23331 SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
23332 SDValue N0 = N->getOperand(0);
23333 EVT VT = N0.getValueType();
23334 unsigned Opcode = N->getOpcode();
23336 // VECREDUCE over 1-element vector is just an extract.
23337 if (VT.getVectorElementCount().isScalar()) {
23340 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
23341 DAG.getVectorIdxConstant(0, dl));
// The reduction's result type may be wider than the element type; extend.
23342 if (Res.getValueType() != N->getValueType(0))
23343 Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
23347 // On an boolean vector an and/or reduction is the same as a umin/umax
23348 // reduction. Convert them if the latter is legal while the former isn't.
23349 if (Opcode == ISD::VECREDUCE_AND || Opcode == ISD::VECREDUCE_OR) {
23350 unsigned NewOpcode = Opcode == ISD::VECREDUCE_AND
23351 ? ISD::VECREDUCE_UMIN : ISD::VECREDUCE_UMAX;
// ComputeNumSignBits == bit width means every lane is all-zeros or all-ones,
// i.e. a boolean vector, so min/max coincide with and/or.
23352 if (!TLI.isOperationLegalOrCustom(Opcode, VT) &&
23353 TLI.isOperationLegalOrCustom(NewOpcode, VT) &&
23354 DAG.ComputeNumSignBits(N0) == VT.getScalarSizeInBits())
23355 return DAG.getNode(NewOpcode, SDLoc(N), N->getValueType(0), N0);
23358 // vecreduce_or(insert_subvector(zero or undef, val)) -> vecreduce_or(val)
23359 // vecreduce_and(insert_subvector(ones or undef, val)) -> vecreduce_and(val)
23360 if (N0.getOpcode() == ISD::INSERT_SUBVECTOR &&
23361 TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
23362 SDValue Vec = N0.getOperand(0);
23363 SDValue Subvec = N0.getOperand(1);
23364 if ((Opcode == ISD::VECREDUCE_OR &&
23365 (N0.getOperand(0).isUndef() || isNullOrNullSplat(Vec))) ||
23366 (Opcode == ISD::VECREDUCE_AND &&
23367 (N0.getOperand(0).isUndef() || isAllOnesOrAllOnesSplat(Vec))))
23368 return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), Subvec);
/// Generic combine for vector-predicated (VP) nodes: if every lane is proven
/// disabled (EVL == 0 or an all-false mask), the operation is a no-op and can
/// be replaced by undef / its chain / its start value as appropriate.
23374 SDValue DAGCombiner::visitVPOp(SDNode *N) {
23375 // VP operations in which all vector elements are disabled - either by
23376 // determining that the mask is all false or that the EVL is 0 - can be
23378 bool AreAllEltsDisabled = false;
// Either condition alone is sufficient, hence the |= accumulation.
23379 if (auto EVLIdx = ISD::getVPExplicitVectorLengthIdx(N->getOpcode()))
23380 AreAllEltsDisabled |= isNullConstant(N->getOperand(*EVLIdx));
23381 if (auto MaskIdx = ISD::getVPMaskIdx(N->getOpcode()))
23382 AreAllEltsDisabled |=
23383 ISD::isConstantSplatVectorAllZeros(N->getOperand(*MaskIdx).getNode());
23385 // This is the only generic VP combine we support for now.
23386 if (!AreAllEltsDisabled)
23389 // Binary operations can be replaced by UNDEF.
23390 if (ISD::isVPBinaryOp(N->getOpcode()))
23391 return DAG.getUNDEF(N->getValueType(0));
23393 // VP Memory operations can be replaced by either the chain (stores) or the
23394 // chain + undef (loads).
23395 if (const auto *MemSD = dyn_cast<MemSDNode>(N)) {
23396 if (MemSD->writeMem())
23397 return MemSD->getChain();
// Loads produce two results (value, chain); CombineTo replaces both.
23398 return CombineTo(N, DAG.getUNDEF(N->getValueType(0)), MemSD->getChain());
23401 // Reduction operations return the start operand when no elements are active.
23402 if (ISD::isVPReduction(N->getOpcode()))
23403 return N->getOperand(0);
23408 /// Returns a vector_shuffle if it able to transform an AND to a vector_shuffle
23409 /// with the destination vector and a zero vector.
23410 /// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
23411 /// vector_shuffle V, Zero, <0, 4, 2, 4>
23412 SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
23413 assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
23415 EVT VT = N->getValueType(0);
23416 SDValue LHS = N->getOperand(0);
// Peek through bitcasts so a mask built in a different element width (e.g.
// a v2i64 constant over v4i32 data) is still recognized.
23417 SDValue RHS = peekThroughBitcasts(N->getOperand(1));
23420 // Make sure we're not running after operation legalization where it
23421 // may have custom lowered the vector shuffles.
23422 if (LegalOperations)
// The mask must be a constant BUILD_VECTOR to inspect element by element.
23425 if (RHS.getOpcode() != ISD::BUILD_VECTOR)
23428 EVT RVT = RHS.getValueType();
23429 unsigned NumElts = RHS.getNumOperands();
23431 // Attempt to create a valid clear mask, splitting the mask into
23432 // sub elements and checking to see if each is
23433 // all zeros or all ones - suitable for shuffle masking.
// Split == 1 tries whole elements; larger Split values retry at narrower
// sub-element widths (down to bytes) when a full element is mixed.
23434 auto BuildClearMask = [&](int Split) {
23435 int NumSubElts = NumElts * Split;
23436 int NumSubBits = RVT.getScalarSizeInBits() / Split;
23438 SmallVector<int, 8> Indices;
23439 for (int i = 0; i != NumSubElts; ++i) {
23440 int EltIdx = i / Split;
23441 int SubIdx = i % Split;
23442 SDValue Elt = RHS.getOperand(EltIdx);
23443 // X & undef --> 0 (not undef). So this lane must be converted to choose
23444 // from the zero constant vector (same as if the element had all 0-bits).
// Shuffle index >= NumSubElts selects from the second (zero) operand.
23445 if (Elt.isUndef()) {
23446 Indices.push_back(i + NumSubElts);
// The mask element may be an integer or an FP constant; either way we work
// on its raw bit pattern.
23451 if (isa<ConstantSDNode>(Elt))
23452 Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
23453 else if (isa<ConstantFPSDNode>(Elt))
23454 Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
23458 // Extract the sub element from the constant bit mask.
// Sub-element order within an element depends on endianness.
23459 if (DAG.getDataLayout().isBigEndian())
23460 Bits = Bits.extractBits(NumSubBits, (Split - SubIdx - 1) * NumSubBits);
23462 Bits = Bits.extractBits(NumSubBits, SubIdx * NumSubBits);
// All-ones keeps the LHS lane; all-zeros selects the zero vector; anything
// mixed makes this Split level unusable.
23464 if (Bits.isAllOnes())
23465 Indices.push_back(i);
23466 else if (Bits == 0)
23467 Indices.push_back(i + NumSubElts);
23472 // Let's see if the target supports this vector_shuffle.
23473 EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
23474 EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
23475 if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
// Shuffle (bitcast LHS) against zero, then bitcast back to the result type.
23478 SDValue Zero = DAG.getConstant(0, DL, ClearVT);
23479 return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, DL,
23480 DAG.getBitcast(ClearVT, LHS),
23484 // Determine maximum split level (byte level masking).
23486 if (RVT.getScalarSizeInBits() % 8 == 0)
23487 MaxSplit = RVT.getScalarSizeInBits() / 8;
// Try coarsest-to-finest granularity; first legal clear mask wins.
23489 for (int Split = 1; Split <= MaxSplit; ++Split)
23490 if (RVT.getScalarSizeInBits() % Split == 0)
23491 if (SDValue S = BuildClearMask(Split))
23497 /// If a vector binop is performed on splat values, it may be profitable to
23498 /// extract, scalarize, and insert/splat.
// I.e. bo (splat X), (splat Y) --> splat (bo X, Y): do the arithmetic once
// on scalars instead of across the whole vector.
23499 static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG,
23501 SDValue N0 = N->getOperand(0);
23502 SDValue N1 = N->getOperand(1);
23503 unsigned Opcode = N->getOpcode();
23504 EVT VT = N->getValueType(0);
23505 EVT EltVT = VT.getVectorElementType();
23506 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23508 // TODO: Remove/replace the extract cost check? If the elements are available
23509 // as scalars, then there may be no extract cost. Should we ask if
23510 // inserting a scalar back into a vector is cheap instead?
23511 int Index0, Index1;
// getSplatSourceVector reports the vector a splat is built from and which
// lane is splatted; both operands must splat the same lane.
23512 SDValue Src0 = DAG.getSplatSourceVector(N0, Index0);
23513 SDValue Src1 = DAG.getSplatSourceVector(N1, Index1);
23514 // Extract element from splat_vector should be free.
23515 // TODO: use DAG.isSplatValue instead?
23516 bool IsBothSplatVector = N0.getOpcode() == ISD::SPLAT_VECTOR &&
23517 N1.getOpcode() == ISD::SPLAT_VECTOR;
// Bail out unless: both sides are splats of the same index, element types
// line up, the extract is free (or both are SPLAT_VECTOR), and the scalar
// form of the opcode is available.
23518 if (!Src0 || !Src1 || Index0 != Index1 ||
23519 Src0.getValueType().getVectorElementType() != EltVT ||
23520 Src1.getValueType().getVectorElementType() != EltVT ||
23521 !(IsBothSplatVector || TLI.isExtractVecEltCheap(VT, Index0)) ||
23522 !TLI.isOperationLegalOrCustom(Opcode, EltVT))
// Perform the operation on the two extracted scalars, keeping N's flags.
23525 SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
23526 SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
23527 SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
23528 SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
23530 // If all lanes but 1 are undefined, no need to splat the scalar result.
23531 // TODO: Keep track of undefs and use that info in the general case.
23532 if (N0.getOpcode() == ISD::BUILD_VECTOR && N0.getOpcode() == N1.getOpcode() &&
23533 count_if(N0->ops(), [](SDValue V) { return !V.isUndef(); }) == 1 &&
23534 count_if(N1->ops(), [](SDValue V) { return !V.isUndef(); }) == 1) {
23535 // bo (build_vec ..undef, X, undef...), (build_vec ..undef, Y, undef...) -->
23536 // build_vec ..undef, (bo X, Y), undef...
23537 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), DAG.getUNDEF(EltVT));
23538 Ops[Index0] = ScalarBO;
23539 return DAG.getBuildVector(VT, DL, Ops);
23542 // bo (splat X, Index), (splat Y, Index) --> splat (bo X, Y), Index
// Scalable vectors must use SPLAT_VECTOR; fixed vectors can use a
// BUILD_VECTOR with the scalar repeated in every lane.
23543 if (VT.isScalableVector())
23544 return DAG.getSplatVector(VT, DL, ScalarBO);
23545 SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), ScalarBO);
23546 return DAG.getBuildVector(VT, DL, Ops);
23549 /// Visit a binary vector operation, like ADD.
// Shared simplification entry point for vector binops: sinks identical
// shuffles, splat shuffles with a constant operand, INSERT_SUBVECTOR pairs,
// and CONCAT_VECTORS pairs below the binop, then tries splat scalarization.
23550 SDValue DAGCombiner::SimplifyVBinOp(SDNode *N, const SDLoc &DL) {
23551 EVT VT = N->getValueType(0);
23552 assert(VT.isVector() && "SimplifyVBinOp only works on vectors!");
23554 SDValue LHS = N->getOperand(0);
23555 SDValue RHS = N->getOperand(1);
23556 unsigned Opcode = N->getOpcode();
23557 SDNodeFlags Flags = N->getFlags();
23559 // Move unary shuffles with identical masks after a vector binop:
23560 // VBinOp (shuffle A, Undef, Mask), (shuffle B, Undef, Mask))
23561 // --> shuffle (VBinOp A, B), Undef, Mask
23562 // This does not require type legality checks because we are creating the
23563 // same types of operations that are in the original sequence. We do have to
23564 // restrict ops like integer div that have immediate UB (eg, div-by-zero)
23565 // though. This code is adapted from the identical transform in instcombine.
// Division/remainder are excluded: the shuffle may move a zero divisor into
// a lane that the original op never divided by.
23566 if (Opcode != ISD::UDIV && Opcode != ISD::SDIV &&
23567 Opcode != ISD::UREM && Opcode != ISD::SREM &&
23568 Opcode != ISD::UDIVREM && Opcode != ISD::SDIVREM) {
23569 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
23570 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
// One-use (or LHS == RHS) keeps this from duplicating the shuffled values.
23571 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
23572 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
23573 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
23574 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
23575 RHS.getOperand(0), Flags);
23576 SDValue UndefV = LHS.getOperand(1);
23577 return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
23580 // Try to sink a splat shuffle after a binop with a uniform constant.
23581 // This is limited to cases where neither the shuffle nor the constant have
23582 // undefined elements because that could be poison-unsafe or inhibit
23583 // demanded elements analysis. It is further limited to not change a splat
23584 // of an inserted scalar because that may be optimized better by
23585 // load-folding or other target-specific behaviors.
23586 if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
23587 Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
23588 Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
23589 // binop (splat X), (splat C) --> splat (binop X, C)
23590 SDValue X = Shuf0->getOperand(0);
23591 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
23592 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
// Mirror image of the previous fold with the constant on the left; operand
// order is preserved (LHS stays first) for non-commutative opcodes.
23595 if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
23596 Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
23597 Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
23598 // binop (splat C), (splat X) --> splat (binop C, X)
23599 SDValue X = Shuf1->getOperand(0);
23600 SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
23601 return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
23606 // The following pattern is likely to emerge with vector reduction ops. Moving
23607 // the binary operation ahead of insertion may allow using a narrower vector
23608 // instruction that has better performance than the wide version of the op:
23609 // VBinOp (ins undef, X, Z), (ins undef, Y, Z) --> ins VecC, (VBinOp X, Y), Z
23610 if (LHS.getOpcode() == ISD::INSERT_SUBVECTOR && LHS.getOperand(0).isUndef() &&
23611 RHS.getOpcode() == ISD::INSERT_SUBVECTOR && RHS.getOperand(0).isUndef() &&
23612 LHS.getOperand(2) == RHS.getOperand(2) &&
23613 (LHS.hasOneUse() || RHS.hasOneUse())) {
23614 SDValue X = LHS.getOperand(1);
23615 SDValue Y = RHS.getOperand(1);
23616 SDValue Z = LHS.getOperand(2);
23617 EVT NarrowVT = X.getValueType();
23618 if (NarrowVT == Y.getValueType() &&
23619 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT,
23620 LegalOperations)) {
23621 // (binop undef, undef) may not return undef, so compute that result.
23623 DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
23624 SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
23625 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
23629 // Make sure all but the first op are undef or constant.
23630 auto ConcatWithConstantOrUndef = [](SDValue Concat) {
23631 return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
23632 all_of(drop_begin(Concat->ops()), [](const SDValue &Op) {
23633 return Op.isUndef() ||
23634 ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
23638 // The following pattern is likely to emerge with vector reduction ops. Moving
23639 // the binary operation ahead of the concat may allow using a narrower vector
23640 // instruction that has better performance than the wide version of the op:
23641 // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
23642 // concat (VBinOp X, Y), VecC
23643 if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
23644 (LHS.hasOneUse() || RHS.hasOneUse())) {
23645 EVT NarrowVT = LHS.getOperand(0).getValueType();
23646 if (NarrowVT == RHS.getOperand(0).getValueType() &&
23647 TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
23648 unsigned NumOperands = LHS.getNumOperands();
23649 SmallVector<SDValue, 4> ConcatOps;
// Operand 0 gets the real narrow binop; operands 1+ are all undef/constant
// per the predicate above, so these nodes constant-fold.
23650 for (unsigned i = 0; i != NumOperands; ++i) {
23651 // This constant fold for operands 1 and up.
23652 ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
23653 RHS.getOperand(i)));
23656 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
// Last resort: if both operands are splats, do the math once on scalars.
23660 if (SDValue V = scalarizeBinOpOfSplats(N, DAG, DL))
/// Simplify a SELECT whose condition is a SETCC by routing it through
/// SimplifySelectCC. If that produces a SELECT_CC, decompose it back into a
/// SETCC + SELECT pair (the form the caller expects); any other node (e.g. an
/// fabs) is returned as-is.
23666 SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
23668 assert(N0.getOpcode() ==ISD::SETCC && "First argument must be a SetCC node!");
23670 SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
23671 cast<CondCodeSDNode>(N0.getOperand(2))->get());
23673 // If we got a simplified select_cc node back from SimplifySelectCC, then
23674 // break it down into a new SETCC node, and a new SELECT node, and then return
23675 // the SELECT node, since we were called with a SELECT node.
23676 if (SCC.getNode()) {
23677 // Check to see if we got a select_cc back (to turn into setcc/select).
23678 // Otherwise, just return whatever node we got back, like fabs.
23679 if (SCC.getOpcode() == ISD::SELECT_CC) {
// Propagate the original condition's flags onto both new nodes.
23680 const SDNodeFlags Flags = N0->getFlags();
// SELECT_CC operands: (lhs, rhs, trueval, falseval, condcode).
23681 SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
23683 SCC.getOperand(0), SCC.getOperand(1),
23684 SCC.getOperand(4), Flags);
23685 AddToWorklist(SETCC.getNode());
23686 SDValue SelectNode = DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
23687 SCC.getOperand(2), SCC.getOperand(3));
23688 SelectNode->setFlags(Flags);
23697 /// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
23698 /// being selected between, see if we can simplify the select. Callers of this
23699 /// should assume that TheSelect is deleted if this returns true. As such, they
23700 /// should return the appropriate thing (e.g. the node) back to the top-level of
23701 /// the DAG combiner loop to avoid it being looked at.
// Two independent folds live here: (1) select of NaN vs fsqrt(x) guarded by
// x < 0 collapses to the fsqrt (it already yields NaN for negative input);
// (2) select of two similar loads becomes one load of a selected address.
23702 bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
23704 // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
23705 // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
23706 if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
23707 if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
23708 // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
23709 SDValue Sqrt = RHS;
23712 const ConstantFPSDNode *Zero = nullptr;
// Dig the compare operands and cond-code out of either a SELECT_CC (inline
// operands) or a SELECT/VSELECT over a SETCC condition.
23714 if (TheSelect->getOpcode() == ISD::SELECT_CC) {
23715 CC = cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
23716 CmpLHS = TheSelect->getOperand(0);
23717 Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
23719 // SELECT or VSELECT
23720 SDValue Cmp = TheSelect->getOperand(0);
23721 if (Cmp.getOpcode() == ISD::SETCC) {
23722 CC = cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
23723 CmpLHS = Cmp.getOperand(0);
23724 Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
// The compare must be exactly "sqrt's operand < +/-0.0" (any lt flavor).
23727 if (Zero && Zero->isZero() &&
23728 Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
23729 CC == ISD::SETULT || CC == ISD::SETLT)) {
23730 // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
23731 CombineTo(TheSelect, Sqrt);
23736 // Cannot simplify select with vector condition
23737 if (TheSelect->getOperand(0).getValueType().isVector()) return false;
23739 // If this is a select from two identical things, try to pull the operation
23740 // through the select.
23741 if (LHS.getOpcode() != RHS.getOpcode() ||
23742 !LHS.hasOneUse() || !RHS.hasOneUse())
23745 // If this is a load and the token chain is identical, replace the select
23746 // of two loads with a load through a select of the address to load from.
23747 // This triggers in things like "select bool X, 10.0, 123.0" after the FP
23748 // constants have been dropped into the constant pool.
23749 if (LHS.getOpcode() == ISD::LOAD) {
23750 LoadSDNode *LLD = cast<LoadSDNode>(LHS);
23751 LoadSDNode *RLD = cast<LoadSDNode>(RHS);
23753 // Token chains must be identical.
// One big legality filter; each clause documents its own reason below.
23754 if (LHS.getOperand(0) != RHS.getOperand(0) ||
23755 // Do not let this transformation reduce the number of volatile loads.
23756 // Be conservative for atomics for the moment
23757 // TODO: This does appear to be legal for unordered atomics (see D66309)
23758 !LLD->isSimple() || !RLD->isSimple() ||
23759 // FIXME: If either is a pre/post inc/dec load,
23760 // we'd need to split out the address adjustment.
23761 LLD->isIndexed() || RLD->isIndexed() ||
23762 // If this is an EXTLOAD, the VT's must match.
23763 LLD->getMemoryVT() != RLD->getMemoryVT() ||
23764 // If this is an EXTLOAD, the kind of extension must match.
23765 (LLD->getExtensionType() != RLD->getExtensionType() &&
23766 // The only exception is if one of the extensions is anyext.
23767 LLD->getExtensionType() != ISD::EXTLOAD &&
23768 RLD->getExtensionType() != ISD::EXTLOAD) ||
23769 // FIXME: this discards src value information. This is
23770 // over-conservative. It would be beneficial to be able to remember
23771 // both potential memory locations. Since we are discarding
23772 // src value info, don't do the transformation if the memory
23773 // locations are not in the default address space.
23774 LLD->getPointerInfo().getAddrSpace() != 0 ||
23775 RLD->getPointerInfo().getAddrSpace() != 0 ||
23776 // We can't produce a CMOV of a TargetFrameIndex since we won't
23777 // generate the address generation required.
23778 LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
23779 RLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
23780 !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
23781 LLD->getBasePtr().getValueType()))
23784 // The loads must not depend on one another.
23785 if (LLD->isPredecessorOf(RLD) || RLD->isPredecessorOf(LLD))
23788 // Check that the select condition doesn't reach either load. If so,
23789 // folding this will induce a cycle into the DAG. If not, this is safe to
23790 // xform, so create a select of the addresses.
23792 SmallPtrSet<const SDNode *, 32> Visited;
23793 SmallVector<const SDNode *, 16> Worklist;
23795 // Always fail if LLD and RLD are not independent. TheSelect is a
23796 // predecessor to all Nodes in question so we need not search past it.
23798 Visited.insert(TheSelect);
23799 Worklist.push_back(LLD);
23800 Worklist.push_back(RLD);
23802 if (SDNode::hasPredecessorHelper(LLD, Visited, Worklist) ||
23803 SDNode::hasPredecessorHelper(RLD, Visited, Worklist))
23807 if (TheSelect->getOpcode() == ISD::SELECT) {
23808 // We cannot do this optimization if any pair of {RLD, LLD} is a
23809 // predecessor to {RLD, LLD, CondNode}. As we've already compared the
23810 // Loads, we only need to check if CondNode is a successor to one of the
23811 // loads. We can further avoid this if there's no use of their chain
23813 SDNode *CondNode = TheSelect->getOperand(0).getNode();
23814 Worklist.push_back(CondNode);
// Visited/Worklist are intentionally reused: earlier exploration state
// carries over, so only the newly added condition node costs extra work.
23816 if ((LLD->hasAnyUseOfValue(1) &&
23817 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
23818 (RLD->hasAnyUseOfValue(1) &&
23819 SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
// Select between the two base pointers under the original condition.
23822 Addr = DAG.getSelect(SDLoc(TheSelect),
23823 LLD->getBasePtr().getValueType(),
23824 TheSelect->getOperand(0), LLD->getBasePtr(),
23825 RLD->getBasePtr());
23826 } else { // Otherwise SELECT_CC
23827 // We cannot do this optimization if any pair of {RLD, LLD} is a
23828 // predecessor to {RLD, LLD, CondLHS, CondRHS}. As we've already compared
23829 // the Loads, we only need to check if CondLHS/CondRHS is a successor to
23830 // one of the loads. We can further avoid this if there's no use of their
23833 SDNode *CondLHS = TheSelect->getOperand(0).getNode();
23834 SDNode *CondRHS = TheSelect->getOperand(1).getNode();
23835 Worklist.push_back(CondLHS);
23836 Worklist.push_back(CondRHS);
23838 if ((LLD->hasAnyUseOfValue(1) &&
23839 SDNode::hasPredecessorHelper(LLD, Visited, Worklist)) ||
23840 (RLD->hasAnyUseOfValue(1) &&
23841 SDNode::hasPredecessorHelper(RLD, Visited, Worklist)))
23844 Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
23845 LLD->getBasePtr().getValueType(),
23846 TheSelect->getOperand(0),
23847 TheSelect->getOperand(1),
23848 LLD->getBasePtr(), RLD->getBasePtr(),
23849 TheSelect->getOperand(4));
23853 // It is safe to replace the two loads if they have different alignments,
23854 // but the new load must be the minimum (most restrictive) alignment of the
// Also drop invariant/dereferenceable MMO flags unless BOTH loads have them.
23856 Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
23857 MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
23858 if (!RLD->isInvariant())
23859 MMOFlags &= ~MachineMemOperand::MOInvariant;
23860 if (!RLD->isDereferenceable())
23861 MMOFlags &= ~MachineMemOperand::MODereferenceable;
// Plain load vs extending load need different construction APIs; the
// earlier filter guarantees the two extension kinds are compatible.
23862 if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
23863 // FIXME: Discards pointer and AA info.
23864 Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
23865 LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
23868 // FIXME: Discards pointer and AA info.
23869 Load = DAG.getExtLoad(
23870 LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
23871 : LLD->getExtensionType(),
23872 SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
23873 MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
23876 // Users of the select now use the result of the load.
23877 CombineTo(TheSelect, Load);
23879 // Users of the old loads now use the new load's chain. We know the
23880 // old-load value is dead now.
23881 CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
23882 CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
23889 /// Try to fold an expression of the form (N0 cond N1) ? N2 : N3 to a shift and
// ... an AND ("gzip trick"): a sign-bit test can be materialized as an
// all-ones/all-zero mask via an arithmetic shift, then ANDed with N2.
23891 SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
23892 SDValue N1, SDValue N2, SDValue N3,
23893 ISD::CondCode CC) {
23894 // If this is a select where the false operand is zero and the compare is a
23895 // check of the sign bit, see if we can perform the "gzip trick":
23896 // select_cc setlt X, 0, A, 0 -> and (sra X, size(X)-1), A
23897 // select_cc setgt X, 0, A, 0 -> and (not (sra X, size(X)-1)), A
23898 EVT XType = N0.getValueType();
23899 EVT AType = N2.getValueType();
// Only applies when the false value is 0 and X is at least as wide as A.
23900 if (!isNullConstant(N3) || !XType.bitsGE(AType))
23903 // If the comparison is testing for a positive value, we have to invert
23904 // the sign bit mask, so only do that transform if the target has a bitwise
23905 // 'and not' instruction (the invert is free).
23906 if (CC == ISD::SETGT && TLI.hasAndNot(N2)) {
23907 // (X > -1) ? A : 0
23908 // (X > 0) ? X : 0 <-- This is canonical signed max.
23909 if (!(isAllOnesConstant(N1) || (isNullConstant(N1) && N0 == N2)))
23911 } else if (CC == ISD::SETLT) {
23913 // (X < 1) ? X : 0 <-- This is un-canonicalized signed min.
23914 if (!(isNullConstant(N1) || (isOneConstant(N1) && N0 == N2)))
23920 // and (sra X, size(X)-1), A -> "and (srl X, C2), A" iff A is a single-bit
// Power-of-two A special case: a logical shift that lands A's single bit in
// place is enough -- no all-ones mask needed.
23922 EVT ShiftAmtTy = getShiftAmountTy(N0.getValueType());
23923 auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
// (v & (v-1)) == 0 is the classic power-of-two-or-zero test.
23924 if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
23925 unsigned ShCt = XType.getSizeInBits() - N2C->getAPIntValue().logBase2() - 1;
// The target may veto shifts of this width/amount (e.g. expensive shifts).
23926 if (!TLI.shouldAvoidTransformToShift(XType, ShCt)) {
23927 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
23928 SDValue Shift = DAG.getNode(ISD::SRL, DL, XType, N0, ShiftAmt);
23929 AddToWorklist(Shift.getNode());
// Narrow the shifted value down to A's type if X was wider.
23931 if (XType.bitsGT(AType)) {
23932 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
23933 AddToWorklist(Shift.getNode());
// SETGT selected the "positive" arm, so the mask must be inverted (andnot).
23936 if (CC == ISD::SETGT)
23937 Shift = DAG.getNOT(DL, Shift, AType);
23939 return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
// General case: arithmetic shift by width-1 smears the sign bit into a
// full all-ones (negative) or all-zeros (non-negative) mask.
23943 unsigned ShCt = XType.getSizeInBits() - 1;
23944 if (TLI.shouldAvoidTransformToShift(XType, ShCt))
23947 SDValue ShiftAmt = DAG.getConstant(ShCt, DL, ShiftAmtTy);
23948 SDValue Shift = DAG.getNode(ISD::SRA, DL, XType, N0, ShiftAmt);
23949 AddToWorklist(Shift.getNode());
23951 if (XType.bitsGT(AType)) {
23952 Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
23953 AddToWorklist(Shift.getNode());
23956 if (CC == ISD::SETGT)
23957 Shift = DAG.getNOT(DL, Shift, AType);
23959 return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
23962 // Fold select(cc, binop(), binop()) -> binop(select(), select()) etc.
// When both select arms are the same binop sharing one operand, hoist the
// binop above the select so only the differing operand is selected.
23963 SDValue DAGCombiner::foldSelectOfBinops(SDNode *N) {
23964 SDValue N0 = N->getOperand(0);
23965 SDValue N1 = N->getOperand(1);
23966 SDValue N2 = N->getOperand(2);
23967 EVT VT = N->getValueType(0);
// Both arms must use the identical binop opcode.
23970 unsigned BinOpc = N1.getOpcode();
23971 if (!TLI.isBinOp(BinOpc) || (N2.getOpcode() != BinOpc))
23974 // The use checks are intentionally on SDNode because we may be dealing
23975 // with opcodes that produce more than one SDValue.
23976 // TODO: Do we really need to check N0 (the condition operand of the select)?
23977 // But removing that clause could cause an infinite loop...
23978 if (!N0->hasOneUse() || !N1->hasOneUse() || !N2->hasOneUse())
23981 // Binops may include opcodes that return multiple values, so all values
23982 // must be created/propagated from the newly created binops below.
23983 SDVTList OpVTs = N1->getVTList();
23985 // Fold select(cond, binop(x, y), binop(z, y))
23986 // --> binop(select(cond, x, z), y)
23987 if (N1.getOperand(1) == N2.getOperand(1)) {
23989 DAG.getSelect(DL, VT, N0, N1.getOperand(0), N2.getOperand(0));
23990 SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, NewSel, N1.getOperand(1));
// The new binop may only keep flags both originals guaranteed: seed with
// N1's flags, then intersect with N2's.
23991 NewBinOp->setFlags(N1->getFlags());
23992 NewBinOp->intersectFlagsWith(N2->getFlags());
23996 // Fold select(cond, binop(x, y), binop(x, z))
23997 // --> binop(x, select(cond, y, z))
23998 // Second op VT might be different (e.g. shift amount type)
// Hence the extra VT equality checks before selecting on operand 1.
23999 if (N1.getOperand(0) == N2.getOperand(0) &&
24000 VT == N1.getOperand(1).getValueType() &&
24001 VT == N2.getOperand(1).getValueType()) {
24003 DAG.getSelect(DL, VT, N0, N1.getOperand(1), N2.getOperand(1));
24004 SDValue NewBinOp = DAG.getNode(BinOpc, DL, OpVTs, N1.getOperand(0), NewSel);
24005 NewBinOp->setFlags(N1->getFlags());
24006 NewBinOp->intersectFlagsWith(N2->getFlags());
24010 // TODO: Handle isCommutativeBinOp patterns as well?
24014 // Transform (fneg/fabs (bitconvert x)) to avoid loading constant pool values.
// fneg becomes an integer XOR with the sign mask; fabs becomes an AND with
// its complement -- done on the pre-bitcast integer value.
24015 SDValue DAGCombiner::foldSignChangeInBitcast(SDNode *N) {
24016 SDValue N0 = N->getOperand(0);
24017 EVT VT = N->getValueType(0);
24018 bool IsFabs = N->getOpcode() == ISD::FABS;
24019 bool IsFree = IsFabs ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT);
// Skip if the FP op is already free on this target, or there is no
// single-use bitcast to look through.
24021 if (IsFree || N0.getOpcode() != ISD::BITCAST || !N0.hasOneUse())
24024 SDValue Int = N0.getOperand(0);
24025 EVT IntVT = Int.getValueType();
24027 // The operand to cast should be integer.
24028 if (!IntVT.isInteger() || IntVT.isVector())
24031 // (fneg (bitconvert x)) -> (bitconvert (xor x sign))
24032 // (fabs (bitconvert x)) -> (bitconvert (and x ~sign))
24034 if (N0.getValueType().isVector()) {
24035 // For vector, create a sign mask (0x80...) or its inverse (for fabs,
24036 // 0x7f...) per element and splat it.
// Note: the FP side can be a vector even though the integer side is scalar
// (e.g. v2f32 bitcast from i64); getSplat replicates the per-element mask.
24037 SignMask = APInt::getSignMask(N0.getScalarValueSizeInBits());
24039 SignMask = ~SignMask;
24040 SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
24042 // For scalar, just use the sign mask (0x80... or the inverse, 0x7f...)
24043 SignMask = APInt::getSignMask(IntVT.getSizeInBits());
24045 SignMask = ~SignMask;
// AND clears the sign bit (fabs); XOR flips it (fneg).
24048 Int = DAG.getNode(IsFabs ? ISD::AND : ISD::XOR, DL, IntVT, Int,
24049 DAG.getConstant(SignMask, DL, IntVT));
24050 AddToWorklist(Int.getNode());
24051 return DAG.getBitcast(VT, Int);
24054 /// Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4)"
24055 /// where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
24056 /// in it. This may be a win when the constant is not otherwise available
24057 /// because it replaces two constant pool loads with one.
24058 SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
24059 const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
24060 ISD::CondCode CC) {
// Target opt-in gate for this transform.
24061 if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
24064 // If we are before legalize types, we want the other legalization to happen
24065 // first (for example, to avoid messing with soft float).
24066 auto *TV = dyn_cast<ConstantFPSDNode>(N2);
24067 auto *FV = dyn_cast<ConstantFPSDNode>(N3);
24068 EVT VT = N2.getValueType();
24069 if (!TV || !FV || !TLI.isTypeLegal(VT))
24072 // If a constant can be materialized without loads, this does not make sense.
24073 if (TLI.getOperationAction(ISD::ConstantFP, VT) == TargetLowering::Legal ||
24074 TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0), ForCodeSize) ||
24075 TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0), ForCodeSize))
24078 // If both constants have multiple uses, then we won't need to do an extra
24079 // load. The values are likely around in registers for other users.
24080 if (!TV->hasOneUse() && !FV->hasOneUse())
// Array layout is {FV, TV}: index 0 holds the false value, so the select
// below maps cond -> offset (true -> element 1, false -> element 0).
24083 Constant *Elts[] = { const_cast<ConstantFP*>(FV->getConstantFPValue()),
24084 const_cast<ConstantFP*>(TV->getConstantFPValue()) };
24085 Type *FPTy = Elts[0]->getType();
24086 const DataLayout &TD = DAG.getDataLayout();
24088 // Create a ConstantArray of the two constants.
24089 Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
24090 SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
24091 TD.getPrefTypeAlign(FPTy));
// The pool may assign a stronger alignment than the preferred one; use it.
24092 Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
24094 // Get offsets to the 0 and 1 elements of the array, so we can select between
24096 SDValue Zero = DAG.getIntPtrConstant(0, DL);
24097 unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
24098 SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
// Re-materialize the comparison, select a byte offset with it, add that to
// the pool address, and load the chosen constant.
24100 DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()), N0, N1, CC);
24101 AddToWorklist(Cond.getNode());
24102 SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(), Cond, One, Zero);
24103 AddToWorklist(CstOffset.getNode());
24104 CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx, CstOffset);
24105 AddToWorklist(CPIdx.getNode());
24106 return DAG.getLoad(TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
24107 MachinePointerInfo::getConstantPool(
24108 DAG.getMachineFunction()), Alignment);
24111 /// Simplify an expression of the form (N0 cond N1) ? N2 : N3
24112 /// where 'cond' is the comparison specified by CC.
24113 SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
24114 SDValue N2, SDValue N3, ISD::CondCode CC,
24115 bool NotExtCompare) {
24116 // (x ? y : y) -> y.
24117 if (N2 == N3) return N2;
24119 EVT CmpOpVT = N0.getValueType();
24120 EVT CmpResVT = getSetCCResultType(CmpOpVT);
24121 EVT VT = N2.getValueType();
24122 auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
24123 auto *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
24124 auto *N3C = dyn_cast<ConstantSDNode>(N3.getNode());
24126 // Determine if the condition we're dealing with is constant.
24127 if (SDValue SCC = DAG.FoldSetCC(CmpResVT, N0, N1, CC, DL)) {
24128 AddToWorklist(SCC.getNode());
24129 if (auto *SCCC = dyn_cast<ConstantSDNode>(SCC)) {
24130 // fold select_cc true, x, y -> x
24131 // fold select_cc false, x, y -> y
24132 return !(SCCC->isZero()) ? N2 : N3;
24137 convertSelectOfFPConstantsToLoadOffset(DL, N0, N1, N2, N3, CC))
24140 if (SDValue V = foldSelectCCToShiftAnd(DL, N0, N1, N2, N3, CC))
24143 // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (shr (shl x)) A)
24144 // where y is has a single bit set.
24145 // A plaintext description would be, we can turn the SELECT_CC into an AND
24146 // when the condition can be materialized as an all-ones register. Any
24147 // single bit-test can be materialized as an all-ones register with
24148 // shift-left and shift-right-arith.
24149 if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
24150 N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
24151 SDValue AndLHS = N0->getOperand(0);
24152 auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
24153 if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
24154 // Shift the tested bit over the sign bit.
24155 const APInt &AndMask = ConstAndRHS->getAPIntValue();
24156 unsigned ShCt = AndMask.getBitWidth() - 1;
24157 if (!TLI.shouldAvoidTransformToShift(VT, ShCt)) {
24159 DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
24160 getShiftAmountTy(AndLHS.getValueType()));
24161 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
24163 // Now arithmetic right shift it all the way over, so the result is
24164 // either all-ones, or zero.
24166 DAG.getConstant(ShCt, SDLoc(Shl),
24167 getShiftAmountTy(Shl.getValueType()));
24168 SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
24170 return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
24175 // fold select C, 16, 0 -> shl C, 4
24176 bool Fold = N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2();
24177 bool Swap = N3C && isNullConstant(N2) && N3C->getAPIntValue().isPowerOf2();
24179 if ((Fold || Swap) &&
24180 TLI.getBooleanContents(CmpOpVT) ==
24181 TargetLowering::ZeroOrOneBooleanContent &&
24182 (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, CmpOpVT))) {
24185 CC = ISD::getSetCCInverse(CC, CmpOpVT);
24186 std::swap(N2C, N3C);
24189 // If the caller doesn't want us to simplify this into a zext of a compare,
24191 if (NotExtCompare && N2C->isOne())
24195 // zext (setcc n0, n1)
24197 SCC = DAG.getSetCC(DL, CmpResVT, N0, N1, CC);
24198 if (VT.bitsLT(SCC.getValueType()))
24199 Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2), VT);
24201 Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
24203 SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
24204 Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2), VT, SCC);
24207 AddToWorklist(SCC.getNode());
24208 AddToWorklist(Temp.getNode());
24213 unsigned ShCt = N2C->getAPIntValue().logBase2();
24214 if (TLI.shouldAvoidTransformToShift(VT, ShCt))
24217 // shl setcc result by log2 n2c
24218 return DAG.getNode(ISD::SHL, DL, N2.getValueType(), Temp,
24219 DAG.getConstant(ShCt, SDLoc(Temp),
24220 getShiftAmountTy(Temp.getValueType())));
24223 // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
24224 // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
24225 // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
24226 // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
24227 // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
24228 // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
24229 // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
24230 // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
24231 if (N1C && N1C->isZero() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
24232 SDValue ValueOnZero = N2;
24233 SDValue Count = N3;
24234 // If the condition is NE instead of E, swap the operands.
24235 if (CC == ISD::SETNE)
24236 std::swap(ValueOnZero, Count);
24237 // Check if the value on zero is a constant equal to the bits in the type.
24238 if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
24239 if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
24240 // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
24241 // legal, combine to just cttz.
24242 if ((Count.getOpcode() == ISD::CTTZ ||
24243 Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
24244 N0 == Count.getOperand(0) &&
24245 (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
24246 return DAG.getNode(ISD::CTTZ, DL, VT, N0);
24247 // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
24248 // legal, combine to just ctlz.
24249 if ((Count.getOpcode() == ISD::CTLZ ||
24250 Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
24251 N0 == Count.getOperand(0) &&
24252 (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
24253 return DAG.getNode(ISD::CTLZ, DL, VT, N0);
24258 // Fold select_cc setgt X, -1, C, ~C -> xor (ashr X, BW-1), C
24259 // Fold select_cc setlt X, 0, C, ~C -> xor (ashr X, BW-1), ~C
24260 if (!NotExtCompare && N1C && N2C && N3C &&
24261 N2C->getAPIntValue() == ~N3C->getAPIntValue() &&
24262 ((N1C->isAllOnes() && CC == ISD::SETGT) ||
24263 (N1C->isZero() && CC == ISD::SETLT)) &&
24264 !TLI.shouldAvoidTransformToShift(VT, CmpOpVT.getScalarSizeInBits() - 1)) {
24265 SDValue ASR = DAG.getNode(
24266 ISD::SRA, DL, CmpOpVT, N0,
24267 DAG.getConstant(CmpOpVT.getScalarSizeInBits() - 1, DL, CmpOpVT));
24268 return DAG.getNode(ISD::XOR, DL, VT, DAG.getSExtOrTrunc(ASR, DL, VT),
24269 DAG.getSExtOrTrunc(CC == ISD::SETLT ? N3 : N2, DL, VT));
24272 if (SDValue S = PerformMinMaxFpToSatCombine(N0, N1, N2, N3, CC, DAG))
24274 if (SDValue S = PerformUMinFpToSatCombine(N0, N1, N2, N3, CC, DAG))
24280 /// This is a stub for TargetLowering::SimplifySetCC.
24281 SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
24282 ISD::CondCode Cond, const SDLoc &DL,
24283 bool foldBooleans) {
24284 TargetLowering::DAGCombinerInfo
24285 DagCombineInfo(DAG, Level, false, this);
24286 return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
24289 /// Given an ISD::SDIV node expressing a divide by constant, return
24290 /// a DAG expression to select that will generate the same value by multiplying
24291 /// by a magic number.
24292 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
24293 SDValue DAGCombiner::BuildSDIV(SDNode *N) {
24294 // when optimising for minimum size, we don't want to expand a div to a mul
24296 if (DAG.getMachineFunction().getFunction().hasMinSize())
24299 SmallVector<SDNode *, 8> Built;
24300 if (SDValue S = TLI.BuildSDIV(N, DAG, LegalOperations, Built)) {
24301 for (SDNode *N : Built)
24309 /// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
24310 /// DAG expression that will generate the same value by right shifting.
24311 SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
24312 ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
24316 // Avoid division by zero.
24320 SmallVector<SDNode *, 8> Built;
24321 if (SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, Built)) {
24322 for (SDNode *N : Built)
24330 /// Given an ISD::UDIV node expressing a divide by constant, return a DAG
24331 /// expression that will generate the same value by multiplying by a magic
24333 /// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
24334 SDValue DAGCombiner::BuildUDIV(SDNode *N) {
24335 // when optimising for minimum size, we don't want to expand a div to a mul
24337 if (DAG.getMachineFunction().getFunction().hasMinSize())
24340 SmallVector<SDNode *, 8> Built;
24341 if (SDValue S = TLI.BuildUDIV(N, DAG, LegalOperations, Built)) {
24342 for (SDNode *N : Built)
24350 /// Given an ISD::SREM node expressing a remainder by constant power of 2,
24351 /// return a DAG expression that will generate the same value.
24352 SDValue DAGCombiner::BuildSREMPow2(SDNode *N) {
24353 ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
24357 // Avoid division by zero.
24361 SmallVector<SDNode *, 8> Built;
24362 if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) {
24363 for (SDNode *N : Built)
24371 /// Determines the LogBase2 value for a non-null input value using the
24372 /// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
24373 SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
24374 EVT VT = V.getValueType();
24375 SDValue Ctlz = DAG.getNode(ISD::CTLZ, DL, VT, V);
24376 SDValue Base = DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT);
24377 SDValue LogBase2 = DAG.getNode(ISD::SUB, DL, VT, Base, Ctlz);
24381 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
24382 /// For the reciprocal, we need to find the zero of the function:
24383 /// F(X) = 1/X - A [which has a zero at X = 1/A]
24385 /// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
24386 /// does not require additional intermediate precision]
24387 /// For the last iteration, put numerator N into it to gain more precision:
24388 /// Result = N X_i + X_i (N - N A X_i)
24389 SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
24390 SDNodeFlags Flags) {
24394 // TODO: Handle extended types?
24395 EVT VT = Op.getValueType();
24396 if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
24397 VT.getScalarType() != MVT::f64)
24400 // If estimates are explicitly disabled for this function, we're done.
24401 MachineFunction &MF = DAG.getMachineFunction();
24402 int Enabled = TLI.getRecipEstimateDivEnabled(VT, MF);
24403 if (Enabled == TLI.ReciprocalEstimate::Disabled)
24406 // Estimates may be explicitly enabled for this type with a custom number of
24407 // refinement steps.
24408 int Iterations = TLI.getDivRefinementSteps(VT, MF);
24409 if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
24410 AddToWorklist(Est.getNode());
24414 SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
24416 // Newton iterations: Est = Est + Est (N - Arg * Est)
24417 // If this is the last iteration, also multiply by the numerator.
24418 for (int i = 0; i < Iterations; ++i) {
24419 SDValue MulEst = Est;
24421 if (i == Iterations - 1) {
24422 MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
24423 AddToWorklist(MulEst.getNode());
24426 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
24427 AddToWorklist(NewEst.getNode());
24429 NewEst = DAG.getNode(ISD::FSUB, DL, VT,
24430 (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
24431 AddToWorklist(NewEst.getNode());
24433 NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
24434 AddToWorklist(NewEst.getNode());
24436 Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
24437 AddToWorklist(Est.getNode());
24440 // If no iterations are available, multiply with N.
24441 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
24442 AddToWorklist(Est.getNode());
24451 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
24452 /// For the reciprocal sqrt, we need to find the zero of the function:
24453 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
24455 /// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
24456 /// As a result, we precompute A/2 prior to the iteration loop.
24457 SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
24458 unsigned Iterations,
24459 SDNodeFlags Flags, bool Reciprocal) {
24460 EVT VT = Arg.getValueType();
24462 SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
24464 // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
24465 // this entire sequence requires only one FP constant.
24466 SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
24467 HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
24469 // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
24470 for (unsigned i = 0; i < Iterations; ++i) {
24471 SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
24472 NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
24473 NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
24474 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
24477 // If non-reciprocal square root is requested, multiply the result by Arg.
24479 Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
24484 /// Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
24485 /// For the reciprocal sqrt, we need to find the zero of the function:
24486 /// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
24488 /// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
24489 SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
24490 unsigned Iterations,
24491 SDNodeFlags Flags, bool Reciprocal) {
24492 EVT VT = Arg.getValueType();
24494 SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
24495 SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
24497 // This routine must enter the loop below to work correctly
24498 // when (Reciprocal == false).
24499 assert(Iterations > 0);
24501 // Newton iterations for reciprocal square root:
24502 // E = (E * -0.5) * ((A * E) * E + -3.0)
24503 for (unsigned i = 0; i < Iterations; ++i) {
24504 SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
24505 SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
24506 SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
24508 // When calculating a square root at the last iteration build:
24509 // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
24510 // (notice a common subexpression)
24512 if (Reciprocal || (i + 1) < Iterations) {
24513 // RSQRT: LHS = (E * -0.5)
24514 LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
24516 // SQRT: LHS = (A * E) * -0.5
24517 LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
24520 Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
24526 /// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
24527 /// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
24528 /// Op can be zero.
24529 SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
24534 // TODO: Handle extended types?
24535 EVT VT = Op.getValueType();
24536 if (VT.getScalarType() != MVT::f16 && VT.getScalarType() != MVT::f32 &&
24537 VT.getScalarType() != MVT::f64)
24540 // If estimates are explicitly disabled for this function, we're done.
24541 MachineFunction &MF = DAG.getMachineFunction();
24542 int Enabled = TLI.getRecipEstimateSqrtEnabled(VT, MF);
24543 if (Enabled == TLI.ReciprocalEstimate::Disabled)
24546 // Estimates may be explicitly enabled for this type with a custom number of
24547 // refinement steps.
24548 int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
24550 bool UseOneConstNR = false;
24552 TLI.getSqrtEstimate(Op, DAG, Enabled, Iterations, UseOneConstNR,
24554 AddToWorklist(Est.getNode());
24557 Est = UseOneConstNR
24558 ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
24559 : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
24562 // Try the target specific test first.
24563 SDValue Test = TLI.getSqrtInputTest(Op, DAG, DAG.getDenormalMode(VT));
24565 // The estimate is now completely wrong if the input was exactly 0.0 or
24566 // possibly a denormal. Force the answer to 0.0 or value provided by
24567 // target for those cases.
24569 Test.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT,
24570 Test, TLI.getSqrtResultForDenormInput(Op, DAG), Est);
24578 SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags) {
24579 return buildSqrtEstimateImpl(Op, Flags, true);
24582 SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags Flags) {
24583 return buildSqrtEstimateImpl(Op, Flags, false);
24586 /// Return true if there is any possibility that the two addresses overlap.
24587 bool DAGCombiner::mayAlias(SDNode *Op0, SDNode *Op1) const {
24589 struct MemUseCharacteristics {
24594 Optional<int64_t> NumBytes;
24595 MachineMemOperand *MMO;
24598 auto getCharacteristics = [](SDNode *N) -> MemUseCharacteristics {
24599 if (const auto *LSN = dyn_cast<LSBaseSDNode>(N)) {
24600 int64_t Offset = 0;
24601 if (auto *C = dyn_cast<ConstantSDNode>(LSN->getOffset()))
24602 Offset = (LSN->getAddressingMode() == ISD::PRE_INC)
24603 ? C->getSExtValue()
24604 : (LSN->getAddressingMode() == ISD::PRE_DEC)
24605 ? -1 * C->getSExtValue()
24608 MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
24609 return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
24610 Offset /*base offset*/,
24611 Optional<int64_t>(Size),
24612 LSN->getMemOperand()};
24614 if (const auto *LN = cast<LifetimeSDNode>(N))
24615 return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
24616 (LN->hasOffset()) ? LN->getOffset() : 0,
24617 (LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
24618 : Optional<int64_t>(),
24619 (MachineMemOperand *)nullptr};
24621 return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
24622 (int64_t)0 /*offset*/,
24623 Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
24626 MemUseCharacteristics MUC0 = getCharacteristics(Op0),
24627 MUC1 = getCharacteristics(Op1);
24629 // If they are to the same address, then they must be aliases.
24630 if (MUC0.BasePtr.getNode() && MUC0.BasePtr == MUC1.BasePtr &&
24631 MUC0.Offset == MUC1.Offset)
24634 // If they are both volatile then they cannot be reordered.
24635 if (MUC0.IsVolatile && MUC1.IsVolatile)
24638 // Be conservative about atomics for the moment
24639 // TODO: This is way overconservative for unordered atomics (see D66309)
24640 if (MUC0.IsAtomic && MUC1.IsAtomic)
24643 if (MUC0.MMO && MUC1.MMO) {
24644 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
24645 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
24649 // Try to prove that there is aliasing, or that there is no aliasing. Either
24650 // way, we can return now. If nothing can be proved, proceed with more tests.
24652 if (BaseIndexOffset::computeAliasing(Op0, MUC0.NumBytes, Op1, MUC1.NumBytes,
24656 // The following all rely on MMO0 and MMO1 being valid. Fail conservatively if
24657 // either are not known.
24658 if (!MUC0.MMO || !MUC1.MMO)
24661 // If one operation reads from invariant memory, and the other may store, they
24662 // cannot alias. These should really be checking the equivalent of mayWrite,
24663 // but it only matters for memory nodes other than load /store.
24664 if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
24665 (MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
24668 // If we know required SrcValue1 and SrcValue2 have relatively large
24669 // alignment compared to the size and offset of the access, we may be able
24670 // to prove they do not alias. This check is conservative for now to catch
24671 // cases created by splitting vector types, it only works when the offsets are
24672 // multiples of the size of the data.
24673 int64_t SrcValOffset0 = MUC0.MMO->getOffset();
24674 int64_t SrcValOffset1 = MUC1.MMO->getOffset();
24675 Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
24676 Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
24677 auto &Size0 = MUC0.NumBytes;
24678 auto &Size1 = MUC1.NumBytes;
24679 if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
24680 Size0.has_value() && Size1.has_value() && *Size0 == *Size1 &&
24681 OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
24682 SrcValOffset1 % *Size1 == 0) {
24683 int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
24684 int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
24686 // There is no overlap between these relatively aligned accesses of
24687 // similar size. Return no alias.
24688 if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
24692 bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
24694 : DAG.getSubtarget().useAA();
24696 if (CombinerAAOnlyFunc.getNumOccurrences() &&
24697 CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
24701 if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() && Size0 &&
24703 // Use alias analysis information.
24704 int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
24705 int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
24706 int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
24708 MemoryLocation(MUC0.MMO->getValue(), Overlap0,
24709 UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
24710 MemoryLocation(MUC1.MMO->getValue(), Overlap1,
24711 UseTBAA ? MUC1.MMO->getAAInfo() : AAMDNodes())))
24715 // Otherwise we have to assume they alias.
24719 /// Walk up chain skipping non-aliasing memory nodes,
24720 /// looking for aliasing nodes and adding them to the Aliases vector.
24721 void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
24722 SmallVectorImpl<SDValue> &Aliases) {
24723 SmallVector<SDValue, 8> Chains; // List of chains to visit.
24724 SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
24726 // Get alias information for node.
24727 // TODO: relax aliasing for unordered atomics (see D66309)
24728 const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
24731 Chains.push_back(OriginalChain);
24732 unsigned Depth = 0;
24734 // Attempt to improve chain by a single step
24735 auto ImproveChain = [&](SDValue &C) -> bool {
24736 switch (C.getOpcode()) {
24737 case ISD::EntryToken:
24738 // No need to mark EntryToken.
24743 // Get alias information for C.
24744 // TODO: Relax aliasing for unordered atomics (see D66309)
24745 bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
24746 cast<LSBaseSDNode>(C.getNode())->isSimple();
24747 if ((IsLoad && IsOpLoad) || !mayAlias(N, C.getNode())) {
24748 // Look further up the chain.
24749 C = C.getOperand(0);
24752 // Alias, so stop here.
24756 case ISD::CopyFromReg:
24757 // Always forward past past CopyFromReg.
24758 C = C.getOperand(0);
24761 case ISD::LIFETIME_START:
24762 case ISD::LIFETIME_END: {
24763 // We can forward past any lifetime start/end that can be proven not to
24764 // alias the memory access.
24765 if (!mayAlias(N, C.getNode())) {
24766 // Look further up the chain.
24767 C = C.getOperand(0);
24777 // Look at each chain and determine if it is an alias. If so, add it to the
24778 // aliases list. If not, then continue up the chain looking for the next
24780 while (!Chains.empty()) {
24781 SDValue Chain = Chains.pop_back_val();
24783 // Don't bother if we've seen Chain before.
24784 if (!Visited.insert(Chain.getNode()).second)
24787 // For TokenFactor nodes, look at each operand and only continue up the
24788 // chain until we reach the depth limit.
24790 // FIXME: The depth check could be made to return the last non-aliasing
24791 // chain we found before we hit a tokenfactor rather than the original
24793 if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
24795 Aliases.push_back(OriginalChain);
24799 if (Chain.getOpcode() == ISD::TokenFactor) {
24800 // We have to check each of the operands of the token factor for "small"
24801 // token factors, so we queue them up. Adding the operands to the queue
24802 // (stack) in reverse order maintains the original order and increases the
24803 // likelihood that getNode will find a matching token factor (CSE.)
24804 if (Chain.getNumOperands() > 16) {
24805 Aliases.push_back(Chain);
24808 for (unsigned n = Chain.getNumOperands(); n;)
24809 Chains.push_back(Chain.getOperand(--n));
24814 if (ImproveChain(Chain)) {
24815 // Updated Chain Found, Consider new chain if one exists.
24816 if (Chain.getNode())
24817 Chains.push_back(Chain);
24821 // No Improved Chain Possible, treat as Alias.
24822 Aliases.push_back(Chain);
24826 /// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
24827 /// (aliasing node.)
24828 SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
24829 if (OptLevel == CodeGenOpt::None)
24832 // Ops for replacing token factor.
24833 SmallVector<SDValue, 8> Aliases;
24835 // Accumulate all the aliases to this node.
24836 GatherAllAliases(N, OldChain, Aliases);
24838 // If no operands then chain to entry token.
24839 if (Aliases.size() == 0)
24840 return DAG.getEntryNode();
24842 // If a single operand then chain to it. We don't need to revisit it.
24843 if (Aliases.size() == 1)
24846 // Construct a custom tailored token factor.
24847 return DAG.getTokenFactor(SDLoc(N), Aliases);
namespace {

// TODO: Replace with std::monostate when we move to C++17.
// Empty tag type used as the (zero-size) mapped value of the IntervalMap
// below, where only the covered key ranges matter.
struct UnitT { } Unit;
bool operator==(const UnitT &, const UnitT &) { return true; }
bool operator!=(const UnitT &, const UnitT &) { return false; }

} // namespace
24857 // This function tries to collect a bunch of potentially interesting
24858 // nodes to improve the chains of, all at once. This might seem
24859 // redundant, as this function gets called when visiting every store
24860 // node, so why not let the work be done on each store as it's visited?
24862 // I believe this is mainly important because mergeConsecutiveStores
24863 // is unable to deal with merging stores of different sizes, so unless
24864 // we improve the chains of all the potential candidates up-front
24865 // before running mergeConsecutiveStores, it might only see some of
24866 // the nodes that will eventually be candidates, and then not be able
24867 // to go from a partially-merged state to the desired final
24868 // fully-merged state.
24870 bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
24871 SmallVector<StoreSDNode *, 8> ChainedStores;
24872 StoreSDNode *STChain = St;
24873 // Intervals records which offsets from BaseIndex have been covered. In
24874 // the common case, every store writes to the immediately previous address
24875 // space and thus merged with the previous interval at insertion time.
24878 llvm::IntervalMap<int64_t, UnitT, 8, IntervalMapHalfOpenInfo<int64_t>>;
24882 // This holds the base pointer, index, and the offset in bytes from the base
24884 const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
24886 // We must have a base and an offset.
24887 if (!BasePtr.getBase().getNode())
24890 // Do not handle stores to undef base pointers.
24891 if (BasePtr.getBase().isUndef())
24894 // Do not handle stores to opaque types
24895 if (St->getMemoryVT().isZeroSized())
24898 // BaseIndexOffset assumes that offsets are fixed-size, which
24899 // is not valid for scalable vectors where the offsets are
24900 // scaled by `vscale`, so bail out early.
24901 if (St->getMemoryVT().isScalableVector())
24904 // Add ST's interval.
24905 Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
24907 while (StoreSDNode *Chain = dyn_cast<StoreSDNode>(STChain->getChain())) {
24908 if (Chain->getMemoryVT().isScalableVector())
24911 // If the chain has more than one use, then we can't reorder the mem ops.
24912 if (!SDValue(Chain, 0)->hasOneUse())
24914 // TODO: Relax for unordered atomics (see D66309)
24915 if (!Chain->isSimple() || Chain->isIndexed())
24918 // Find the base pointer and offset for this memory node.
24919 const BaseIndexOffset Ptr = BaseIndexOffset::match(Chain, DAG);
24920 // Check that the base pointer is the same as the original one.
24922 if (!BasePtr.equalBaseIndex(Ptr, DAG, Offset))
24924 int64_t Length = (Chain->getMemoryVT().getSizeInBits() + 7) / 8;
24925 // Make sure we don't overlap with other intervals by checking the ones to
24926 // the left or right before inserting.
24927 auto I = Intervals.find(Offset);
24928 // If there's a next interval, we should end before it.
24929 if (I != Intervals.end() && I.start() < (Offset + Length))
24931 // If there's a previous interval, we should start after it.
24932 if (I != Intervals.begin() && (--I).stop() <= Offset)
24934 Intervals.insert(Offset, Offset + Length, Unit);
24936 ChainedStores.push_back(Chain);
24940 // If we didn't find a chained store, exit.
24941 if (ChainedStores.size() == 0)
24944 // Improve all chained stores (St and ChainedStores members) starting from
24945 // where the store chain ended and return single TokenFactor.
24946 SDValue NewChain = STChain->getChain();
24947 SmallVector<SDValue, 8> TFOps;
24948 for (unsigned I = ChainedStores.size(); I;) {
24949 StoreSDNode *S = ChainedStores[--I];
24950 SDValue BetterChain = FindBetterChain(S, NewChain);
24951 S = cast<StoreSDNode>(DAG.UpdateNodeOperands(
24952 S, BetterChain, S->getOperand(1), S->getOperand(2), S->getOperand(3)));
24953 TFOps.push_back(SDValue(S, 0));
24954 ChainedStores[I] = S;
24957 // Improve St's chain. Use a new node to avoid creating a loop from CombineTo.
24958 SDValue BetterChain = FindBetterChain(St, NewChain);
24960 if (St->isTruncatingStore())
24961 NewST = DAG.getTruncStore(BetterChain, SDLoc(St), St->getValue(),
24962 St->getBasePtr(), St->getMemoryVT(),
24963 St->getMemOperand());
24965 NewST = DAG.getStore(BetterChain, SDLoc(St), St->getValue(),
24966 St->getBasePtr(), St->getMemOperand());
24968 TFOps.push_back(NewST);
24970 // If we improved every element of TFOps, then we've lost the dependence on
24971 // NewChain to successors of St and we need to add it back to TFOps. Do so at
24972 // the beginning to keep relative order consistent with FindBetterChains.
24973 auto hasImprovedChain = [&](SDValue ST) -> bool {
24974 return ST->getOperand(0) != NewChain;
24976 bool AddNewChain = llvm::all_of(TFOps, hasImprovedChain);
24978 TFOps.insert(TFOps.begin(), NewChain);
24980 SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
24983 // Add TF and its operands to the worklist.
24984 AddToWorklist(TF.getNode());
24985 for (const SDValue &Op : TF->ops())
24986 AddToWorklist(Op.getNode());
24987 AddToWorklist(STChain);
24991 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
24992 if (OptLevel == CodeGenOpt::None)
24995 const BaseIndexOffset BasePtr = BaseIndexOffset::match(St, DAG);
24997 // We must have a base and an offset.
24998 if (!BasePtr.getBase().getNode())
25001 // Do not handle stores to undef base pointers.
25002 if (BasePtr.getBase().isUndef())
25005 // Directly improve a chain of disjoint stores starting at St.
25006 if (parallelizeChainedStores(St))
25009 // Improve St's Chain..
25010 SDValue BetterChain = FindBetterChain(St, St->getChain());
25011 if (St->getChain() != BetterChain) {
25012 replaceStoreChain(St, BetterChain);
25018 /// This is the entry point for the file.
25019 void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA,
25020 CodeGenOpt::Level OptLevel) {
25021 /// This is the main entry point to this class.
25022 DAGCombiner(*this, AA, OptLevel).Run(Level);