llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
}
-// Vector extract and insert
-let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx_vextractf128_pd_256 :
- GCCBuiltin<"__builtin_ia32_vextractf128_pd256">,
- Intrinsic<[llvm_v2f64_ty], [llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx_vextractf128_ps_256 :
- GCCBuiltin<"__builtin_ia32_vextractf128_ps256">,
- Intrinsic<[llvm_v4f32_ty], [llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
- def int_x86_avx_vextractf128_si_256 :
- GCCBuiltin<"__builtin_ia32_vextractf128_si256">,
- Intrinsic<[llvm_v4i32_ty], [llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>;
-}
-
// Vector convert
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_cvtdq2_pd_256 : GCCBuiltin<"__builtin_ia32_cvtdq2pd256">,
setValue(&I, Res);
return nullptr;
}
- case Intrinsic::x86_avx_vextractf128_pd_256:
- case Intrinsic::x86_avx_vextractf128_ps_256:
- case Intrinsic::x86_avx_vextractf128_si_256:
case Intrinsic::x86_avx2_vextracti128: {
EVT DestVT = TLI.getValueType(I.getType());
uint64_t Idx = (cast<ConstantInt>(I.getArgOperand(1))->getZExtValue() & 1) *
Name == "x86.avx.vinsertf128.pd.256" ||
Name == "x86.avx.vinsertf128.ps.256" ||
Name == "x86.avx.vinsertf128.si.256" ||
+ Name == "x86.avx.vextractf128.pd.256" ||
+ Name == "x86.avx.vextractf128.ps.256" ||
+ Name == "x86.avx.vextractf128.si.256" ||
Name == "x86.avx.movnt.dq.256" ||
Name == "x86.avx.movnt.pd.256" ||
Name == "x86.avx.movnt.ps.256" ||
Idxs2.push_back(Builder.getInt32(Idx));
}
Rep = Builder.CreateShuffleVector(Op0, Rep, ConstantVector::get(Idxs2));
+ } else if (Name == "llvm.x86.avx.vextractf128.pd.256" ||
+            Name == "llvm.x86.avx.vextractf128.ps.256" ||
+            Name == "llvm.x86.avx.vextractf128.si.256") {
+   // Replace a call to one of the AVX vextractf128 intrinsics with a generic
+   // shufflevector that picks one half of the source vector. The source has
+   // twice as many elements as the result (e.g. <4 x double> -> <2 x double>).
+   Value *Src = CI->getArgOperand(0);
+
+   // Only bit 0 of the immediate is meaningful; hardware ignores the rest.
+   const bool TakeHighHalf =
+       (cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue() & 1) != 0;
+
+   // The shuffle produces as many elements as the call's result type has.
+   const unsigned ResultElts =
+       cast<VectorType>(CI->getType())->getNumElements();
+
+   // Mask indices [0, N) name the low half of the source and [N, 2N) the high
+   // half, so the selected half starts at either 0 or N.
+   const unsigned FirstIdx = TakeHighHalf ? ResultElts : 0;
+   SmallVector<Constant*, 4> Mask;
+   Mask.reserve(ResultElts);
+   for (unsigned Lane = 0; Lane != ResultElts; ++Lane)
+     Mask.push_back(Builder.getInt32(FirstIdx + Lane));
+
+   // The second shuffle operand is never referenced by the mask, so it can be
+   // an undef of the source type.
+   Value *Unused = UndefValue::get(Src->getType());
+   Rep = Builder.CreateShuffleVector(Src, Unused, ConstantVector::get(Mask));
} else {
bool PD128 = false, PD256 = false, PS128 = false, PS256 = false;
if (Name == "llvm.x86.avx.vpermil.pd.256")
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone
+; We don't check any vextractf128 variant with immediate 0 because that's just a move.
+
+define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
+; CHECK: vextractf128 $1, %ymm0, %xmm0
+  ; Immediate 1 asks for the upper half of the <4 x double> source.
+  %upper = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
+  ret <2 x double> %upper
+}
+declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
+
+define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
+; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
+; CHECK: vextractf128 $1, %ymm0, %xmm0
+  ; Immediate 1 asks for the upper half of the <8 x float> source.
+  %upper = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
+  ret <4 x float> %upper
+}
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+
+define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
+; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
+; CHECK: vextractf128 $1, %ymm0, %xmm0
+  ; Immediate 1 asks for the upper half of the <8 x i32> source.
+  %upper = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
+  ret <4 x i32> %upper
+}
+declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
+
+; Verify that the high bits of the immediate are masked off: only bit 0 is used,
+; so an immediate of 2 is equivalent to a vextractf128 $0. That should be optimized
+; away, so just verify that no vextractf128 of any kind is emitted.
+define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
+; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
+; CHECK-NOT: vextractf128
+  ; Immediate 2 has bit 0 clear, so this reads the lower half of the source.
+  %lower = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
+  ret <2 x double> %lower
+}
+
+
define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK: vblendpd
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
-define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) {
- ; CHECK: vextractf128
- %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]
- ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_avx_vextractf128_ps_256(<8 x float> %a0) {
- ; CHECK: vextractf128
- %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-
-
-define <4 x i32> @test_x86_avx_vextractf128_si_256(<8 x i32> %a0) {
- ; CHECK: vextractf128
- %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 7) ; <<4 x i32>> [#uses=1]
- ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
-
-
define <4 x double> @test_x86_avx_vperm2f128_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK: vperm2f128
%res = call <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double> %a0, <4 x double> %a1, i8 7) ; <<4 x double>> [#uses=1]