If we're inserting into v2i8/v4i8/v8i8/v2i16/v4i16 style sub-128bit vectors ensure we don't use the SK_PermuteTwoSrc cost of the legalized value type - this is a followup to rG12c629ec6c59 which added equivalent sub-128bit shuffle costs
// should be very cheap (assume cost = 1). For insertions we need to shuffle
// the elements to its destination. In both cases we must handle the
// subvector move(s).
+ // If the vector type is already less than 128-bits then don't reduce it.
// TODO: Under what circumstances should we shuffle using the full width?
int ShuffleCost = 1;
if (Opcode == Instruction::InsertElement) {
- Type *SubTy = VectorType::get(Val->getVectorElementType(), SubNumElts);
+ Type *SubTy = Val;
+ EVT VT = TLI->getValueType(DL, Val);
+ if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
+ SubTy = VectorType::get(ScalarType, SubNumElts);
ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
}
int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
define i32 @fptosi_double_i8(i32 %arg) {
; SSE2-LABEL: 'fptosi_double_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi double undef to i8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = fptosi <2 x double> undef to <2 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I8 = fptosi <4 x double> undef to <4 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I8 = fptosi <8 x double> undef to <8 x i8>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptosi_double_i8'
define i32 @fptosi_float_i8(i32 %arg) {
; SSE2-LABEL: 'fptosi_float_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptosi float undef to i8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I8 = fptosi <4 x float> undef to <4 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I8 = fptosi <8 x float> undef to <8 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V16I8 = fptosi <16 x float> undef to <16 x i8>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptosi_float_i8'
define i32 @fptoui_double_i8(i32 %arg) {
; SSE2-LABEL: 'fptoui_double_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui double undef to i8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V2I8 = fptoui <2 x double> undef to <2 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4I8 = fptoui <4 x double> undef to <4 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V8I8 = fptoui <8 x double> undef to <8 x i8>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptoui_double_i8'
define i32 @fptoui_float_i8(i32 %arg) {
; SSE2-LABEL: 'fptoui_float_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %I8 = fptoui float undef to i8
-; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 135 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 271 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V4I8 = fptoui <4 x float> undef to <4 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V8I8 = fptoui <8 x float> undef to <8 x i8>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V16I8 = fptoui <16 x float> undef to <16 x i8>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'fptoui_float_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1148 for instruction: %V64I8 = call <64 x i8> @llvm.masked.load.v64i8.p0v64i8(<64 x i8>* undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 574 for instruction: %V32I8 = call <32 x i8> @llvm.masked.load.v32i8.p0v32i8(<32 x i8>* undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 287 for instruction: %V16I8 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 143 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V8I8 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_load'
; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.gather.v64i8.v64p0i8(<64 x i8*> undef, i32 1, <64 x i1> undef, <64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> undef, i32 1, <32 x i1> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> undef, i32 1, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> undef, i32 1, <8 x i1> undef, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_gather'
; SSE2-NEXT: Cost Model: Found an estimated cost of 960 for instruction: %V64I8 = call <64 x i8> @llvm.masked.expandload.v64i8(i8* undef, <64 x i1> undef, <64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 480 for instruction: %V32I8 = call <32 x i8> @llvm.masked.expandload.v32i8(i8* undef, <32 x i1> undef, <32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 240 for instruction: %V16I8 = call <16 x i8> @llvm.masked.expandload.v16i8(i8* undef, <16 x i1> undef, <16 x i8> undef)
-; SSE2-NEXT: Cost Model: Found an estimated cost of 120 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
+; SSE2-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V8I8 = call <8 x i8> @llvm.masked.expandload.v8i8(i8* undef, <8 x i1> undef, <8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 0
;
; SSE42-LABEL: 'masked_expandload'
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_CD = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 12, i32 13>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V128_EF = shufflevector <16 x i8> %src128, <16 x i8> undef, <2 x i32> <i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_0123 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_2345 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128_6789 = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_89AB = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_CDEF = shufflevector <16 x i8> %src128, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V128_01234567 = shufflevector <16 x i8> %src128, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1C_1D = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 28, i32 29>
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_1E_1F = shufflevector <32 x i8> %src256, <32 x i8> undef, <2 x i32> <i32 30, i32 31>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_00_01_02_03 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_02_03_04_05 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_04_05_06_07 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; SSE2-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
+; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V256_06_07_08_09 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 6, i32 7, i32 8, i32 9>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_08_09_0A_0B = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_0C_0D_0E_0F = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V256_10_11_12_13 = shufflevector <32 x i8> %src256, <32 x i8> undef, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
define i32 @insert_i8(i32 %arg) {
; SSE2-LABEL: 'insert_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
-; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
+; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8
;
; SSE3-LABEL: 'insert_i8'
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2i8_a = insertelement <2 x i8> undef, i8 undef, i32 %arg
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
+; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_0 = insertelement <2 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8_3 = insertelement <2 x i8> undef, i8 undef, i32 1
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v4i8_a = insertelement <4 x i8> undef, i8 undef, i32 %arg
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
+; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i8_0 = insertelement <4 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v4i8_3 = insertelement <4 x i8> undef, i8 undef, i32 3
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v8i8_a = insertelement <8 x i8> undef, i8 undef, i32 %arg
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
-; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
+; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_0 = insertelement <8 x i8> undef, i8 undef, i32 0
+; SSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v8i8_7 = insertelement <8 x i8> undef, i8 undef, i32 7
; SSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16i8_a = insertelement <16 x i8> undef, i8 undef, i32 %arg
; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_0 = insertelement <16 x i8> undef, i8 undef, i32 0
; SSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v16i8_8 = insertelement <16 x i8> undef, i8 undef, i32 8