{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
- { ISD::MUL, MVT::v16i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v8i32, 1 }, // pmulld (Skylake from agner.org)
+ { ISD::MUL, MVT::v4i32, 1 }, // pmulld (Skylake from agner.org)
{ ISD::MUL, MVT::v8i64, 8 }, // 3*pmuludq/3*shift/2*add
// Vectorizing division is a bad idea. See the SSE2 table for more comments.
{ ISD::MUL, MVT::v32i8, 17 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 7 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i16, 1 }, // pmullw
- { ISD::MUL, MVT::v8i32, 1 }, // pmulld
+ { ISD::MUL, MVT::v8i32, 2 }, // pmulld (Haswell from agner.org)
{ ISD::MUL, MVT::v4i64, 8 }, // 3*pmuludq/3*shift/2*add
{ ISD::FDIV, MVT::f32, 7 }, // Haswell from http://www.agner.org/
{ ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend.
{ ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split.
- { ISD::MUL, MVT::v4i32, 1 } // pmulld
+ { ISD::MUL, MVT::v4i32, 2 } // pmulld (Nehalem from agner.org)
};
if (ST->hasSSE41())
; CHECK: cost of 1 {{.*}} %I32 = mul
%I32 = mul i32 undef, undef
; SSSE3: cost of 6 {{.*}} %V4I32 = mul
- ; SSE42: cost of 1 {{.*}} %V4I32 = mul
- ; AVX: cost of 1 {{.*}} %V4I32 = mul
- ; AVX2: cost of 1 {{.*}} %V4I32 = mul
+ ; SSE42: cost of 2 {{.*}} %V4I32 = mul
+ ; AVX: cost of 2 {{.*}} %V4I32 = mul
+ ; AVX2: cost of 2 {{.*}} %V4I32 = mul
; AVX512: cost of 1 {{.*}} %V4I32 = mul
%V4I32 = mul <4 x i32> undef, undef
; SSSE3: cost of 12 {{.*}} %V8I32 = mul
- ; SSE42: cost of 2 {{.*}} %V8I32 = mul
+ ; SSE42: cost of 4 {{.*}} %V8I32 = mul
; AVX: cost of 4 {{.*}} %V8I32 = mul
- ; AVX2: cost of 1 {{.*}} %V8I32 = mul
+ ; AVX2: cost of 2 {{.*}} %V8I32 = mul
; AVX512: cost of 1 {{.*}} %V8I32 = mul
%V8I32 = mul <8 x i32> undef, undef
; SSSE3: cost of 24 {{.*}} %V16I32 = mul
- ; SSE42: cost of 4 {{.*}} %V16I32 = mul
+ ; SSE42: cost of 8 {{.*}} %V16I32 = mul
; AVX: cost of 8 {{.*}} %V16I32 = mul
- ; AVX2: cost of 2 {{.*}} %V16I32 = mul
+ ; AVX2: cost of 4 {{.*}} %V16I32 = mul
; AVX512: cost of 1 {{.*}} %V16I32 = mul
%V16I32 = mul <16 x i32> undef, undef
define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i32':
; SSE2: Found an estimated cost of 6 for instruction: %shift
-; SSE41: Found an estimated cost of 1 for instruction: %shift
-; AVX: Found an estimated cost of 1 for instruction: %shift
+; SSE41: Found an estimated cost of 2 for instruction: %shift
+; AVX: Found an estimated cost of 2 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; XOPAVX: Found an estimated cost of 1 for instruction: %shift
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32':
; SSE2: Found an estimated cost of 12 for instruction: %shift
-; SSE41: Found an estimated cost of 2 for instruction: %shift
+; SSE41: Found an estimated cost of 4 for instruction: %shift
; AVX: Found an estimated cost of 4 for instruction: %shift
; AVX2: Found an estimated cost of 1 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) {
; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32':
; SSE2: Found an estimated cost of 24 for instruction: %shift
-; SSE41: Found an estimated cost of 4 for instruction: %shift
+; SSE41: Found an estimated cost of 8 for instruction: %shift
; AVX: Found an estimated cost of 8 for instruction: %shift
; AVX2: Found an estimated cost of 2 for instruction: %shift
; AVX512: Found an estimated cost of 1 for instruction: %shift
; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
-; Make sure that the estimated cost is always 1 except for the case where
+; Make sure that the estimated cost is 2 (1 with AVX2/XOP) except for the case where
; we only have SSE2 support. With SSE2, we are forced to special lower the
; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.
}
; CHECK: 'Cost Model Analysis' for function 'test3':
; SSE2: Found an estimated cost of 6 for instruction: %shl
-; SSE41: Found an estimated cost of 1 for instruction: %shl
-; AVX: Found an estimated cost of 1 for instruction: %shl
+; SSE41: Found an estimated cost of 2 for instruction: %shl
+; AVX: Found an estimated cost of 2 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOP: Found an estimated cost of 1 for instruction: %shl
}
; CHECK: 'Cost Model Analysis' for function 'test4':
; SSE2: Found an estimated cost of 6 for instruction: %shl
-; SSE41: Found an estimated cost of 1 for instruction: %shl
-; AVX: Found an estimated cost of 1 for instruction: %shl
+; SSE41: Found an estimated cost of 2 for instruction: %shl
+; AVX: Found an estimated cost of 2 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOP: Found an estimated cost of 1 for instruction: %shl
}
; CHECK: 'Cost Model Analysis' for function 'test7':
; SSE2: Found an estimated cost of 12 for instruction: %shl
-; SSE41: Found an estimated cost of 2 for instruction: %shl
+; SSE41: Found an estimated cost of 4 for instruction: %shl
; AVX: Found an estimated cost of 4 for instruction: %shl
; AVX2: Found an estimated cost of 1 for instruction: %shl
; XOPAVX: Found an estimated cost of 4 for instruction: %shl
}
; CHECK: 'Cost Model Analysis' for function 'test10':
; SSE2: Found an estimated cost of 24 for instruction: %shl
-; SSE41: Found an estimated cost of 4 for instruction: %shl
+; SSE41: Found an estimated cost of 8 for instruction: %shl
; AVX: Found an estimated cost of 8 for instruction: %shl
; AVX2: Found an estimated cost of 2 for instruction: %shl
; XOPAVX: Found an estimated cost of 8 for instruction: %shl