/// efficiently, casting the load to a smaller vector of larger types and
/// loading it may be more efficient. However, this can be undone by
/// optimizations in the DAG combiner.
- virtual bool isLoadBitCastBeneficial(EVT LoadVT,
- EVT BitcastVT) const {
+ virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
  // Don't do this if we could do an indexed load on the original type, but
  // not on the new one.
  if (!LoadVT.isSimple() || !BitcastVT.isSimple())
    return true;

  MVT LoadMVT = LoadVT.getSimpleVT();

  // Don't bother doing this if it's just going to be promoted again later, as
  // doing so might interfere with other combines.
  if (getOperationAction(ISD::LOAD, LoadMVT) == Promote &&
      getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
    return false;
- return true;
+ bool Fast = false;
+ return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+ MMO, &Fast) && Fast;
}
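With the new signature, the default implementation defers the final answer to allowsMemoryAccess on the access's actual MachineMemOperand, so the combine only fires when the bitcast type is both allowed and fast for that concrete access. A minimal sketch of how a target override can use the extra parameters (MyTargetLowering and its volatility policy are hypothetical, not part of this patch):

// Hypothetical override: never retype volatile accesses, then fall back
// to the default MMO-based fast-access check.
bool MyTargetLowering::isLoadBitCastBeneficial(
    EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {
  // The MMO describes the concrete access, so volatility, alignment, and
  // address space are now visible to the hook.
  if (MMO.isVolatile())
    return false;
  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}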
/// Return true if the following transform is beneficial:
/// (store (y (conv x)), y*) -> (store x, (x*))
- virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+ virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
// Default to the same logic as loads.
- return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+ return isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
}
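Because the default store hook simply delegates to the load hook, a target that overrides only isLoadBitCastBeneficial (as AMDGPU and X86 do below) changes the store-side combine as well. A target that wants stricter store behavior can override the store hook separately; a sketch under the same hypothetical target, using the era-appropriate getAlignment() API this patch also uses:

// Hypothetical override: additionally require the rewritten store to be
// naturally aligned for the new type before applying the shared default.
bool MyTargetLowering::isStoreBitCastBeneficial(
    EVT StoreVT, EVT BitcastVT, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {
  if (MMO.getAlignment() < BitcastVT.getStoreSize())
    return false;
  return TargetLowering::isStoreBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
}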
/// Return true if it is expected to be cheaper to do a store of a non-zero
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
- TLI.isOperationLegal(ISD::LOAD, VT)) &&
- TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+ TLI.isOperationLegal(ISD::LOAD, VT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- bool Fast = false;
- if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *LN0->getMemOperand(), &Fast) &&
- Fast) {
+ if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+ *LN0->getMemOperand())) {
SDValue Load =
DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
LN0->getPointerInfo(), LN0->getAlignment(),
// illegal type.
if (((!LegalOperations && !ST->isVolatile()) ||
TLI.isOperationLegal(ISD::STORE, SVT)) &&
- TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
- bool Fast = false;
- if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
- *ST->getMemOperand(), &Fast) &&
- Fast) {
- return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
- ST->getPointerInfo(), ST->getAlignment(),
- ST->getMemOperand()->getFlags(), ST->getAAInfo());
- }
+ TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+ DAG, *ST->getMemOperand())) {
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getPointerInfo(), ST->getAlignment(),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
}
}
return (OldSize < 32);
}
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
- EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
unsigned LScalarSize = LoadTy.getScalarSizeInBits();
unsigned CastScalarSize = CastTy.getScalarSizeInBits();
- return (LScalarSize < CastScalarSize) ||
- (CastScalarSize >= 32);
+ if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+ return false;
+
+ bool Fast = false;
+ return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+ MMO, &Fast) && Fast;
}
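The AMDGPU hook keeps its old scalar-size heuristic as a cheap early reject and only then pays for the allowsMemoryAccess query. Restating that first stage as a standalone predicate, with illustrative type pairs (the helper is ours, for exposition only):

// Exposition-only restatement of the scalar-size filter above.
// Example outcomes (element sizes in bits):
//   v4i32 -> v2i64 : 32 < 64            -> proceeds to allowsMemoryAccess
//   v2i64 -> v4i32 : 64 >= 32, 32 >= 32 -> proceeds to allowsMemoryAccess
//   v2i64 -> v8i16 : 64 >= 16, 16 < 32  -> rejected outright
static bool passesScalarSizeFilter(unsigned LScalarSize,
                                   unsigned CastScalarSize) {
  return LScalarSize < CastScalarSize || CastScalarSize >= 32;
}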
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
ISD::LoadExtType ExtType,
EVT ExtVT) const override;
- bool isLoadBitCastBeneficial(EVT, EVT) const final;
+ bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const final;
bool storeOfVectorConstantIsCheap(EVT MemVT,
unsigned NumElem,
return Subtarget.hasLZCNT();
}
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
- EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const {
if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
BitcastVT.getVectorElementType() == MVT::i1)
return false;
if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
return false;
- return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+ // If both types are legal vectors, it's always ok to convert them.
+ if (LoadVT.isVector() && BitcastVT.isVector() &&
+ isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+ return true;
+
+ return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}
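The new X86 legal-vector early-out sits between the target-specific mask-type rejects and the generic MMO-based check, so legal-vector-to-legal-vector bitcasts never pay for, or get blocked by, allowsMemoryAccess. The decision order, restated for exposition (the helper and its boolean parameters are ours, not an LLVM API):

// Exposition-only restatement of the ordering in the X86 hook above.
static bool x86LoadBitCastDecision(bool TargetRejects, bool BothLegalVectors,
                                   bool DefaultCheckAllowsAndFast) {
  if (TargetRejects)          // e.g. the i1 mask cases without AVX512/DQI
    return false;
  if (BothLegalVectors)       // new fast path: always beneficial
    return true;
  return DefaultCheckAllowsAndFast; // TargetLowering's MMO-based default
}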
bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
return NumElem > 2;
}
- bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+ bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+ const SelectionDAG &DAG,
+ const MachineMemOperand &MMO) const override;
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; X86-SSE2-NEXT: movdqu 16(%ecx), %xmm1
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, (%eax)
-; X86-SSE2-NEXT: movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 12(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm2, %ecx
; X86-SSE2-NEXT: movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 4(%eax)
; X86-SSE2-NEXT: movd %xmm1, %ecx
; X86-SSE2-NEXT: movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT: movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 28(%eax)
; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT: movd %xmm1, %ecx
+; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT: movd %xmm0, %ecx
; X86-SSE2-NEXT: movntil %ecx, 20(%eax)
; X86-SSE2-NEXT: retl
;
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
; SSE-LABEL: shuffle_v4i32_0001: