On AVX1 targets we can handle v4i64 logical shifts by 32 bits as a pair of v8f32 shuffles with zero.
I was hoping to put this in LowerScalarImmediateShift, but performing the fold that early caused regressions where other combines re-split the subvectors.
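
The two shuffle masks encode the little-endian lane rewrite: shifting an i64 left by 32 turns its {lo,hi} i32 pair into {0,lo}, and a logical right shift by 32 turns it into {hi,0}. A minimal standalone sketch of that equivalence (hypothetical test code, assuming a little-endian host; not part of the patch):

  #include <array>
  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    std::array<uint64_t, 4> V = {0x0123456789abcdefULL, 1ULL, ~0ULL, 42ULL};

    // Reinterpret the v4i64 as eight i32 lanes, as the v8i32 bitcast does.
    std::array<uint32_t, 8> Src, Shl, Srl;
    std::memcpy(Src.data(), V.data(), sizeof(V));

    for (int I = 0; I != 4; ++I) {
      // VSHLI mask {8,0,8,2,8,4,8,6}: lane {lo,hi} becomes {0,lo}.
      Shl[2 * I + 0] = 0; // shuffle index 8: element of the zero vector
      Shl[2 * I + 1] = Src[2 * I + 0];
      // VSRLI mask {1,8,3,8,5,8,7,8}: lane {lo,hi} becomes {hi,0}.
      Srl[2 * I + 0] = Src[2 * I + 1];
      Srl[2 * I + 1] = 0;
    }

    std::array<uint64_t, 4> ShlV, SrlV;
    std::memcpy(ShlV.data(), Shl.data(), sizeof(ShlV));
    std::memcpy(SrlV.data(), Srl.data(), sizeof(SrlV));
    for (int I = 0; I != 4; ++I) {
      assert(ShlV[I] == V[I] << 32);
      assert(SrlV[I] == V[I] >> 32);
    }
    return 0;
  }

Both asserts hold for any input, since the bits shifted out are exactly the i32 lane that gets replaced by zero.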
  }
  break;
case X86ISD::VSHLI:
- case X86ISD::VSRAI:
case X86ISD::VSRLI:
+   // Special case: SHL/SRL AVX1 V4i64 by 32-bits can lower as a shuffle.
+   // TODO: Move this to LowerScalarImmediateShift?
+   if (VT == MVT::v4i64 && !Subtarget.hasInt256() &&
+       llvm::all_of(Ops, [](SDValue Op) {
+         return Op.getConstantOperandAPInt(1) == 32;
+       })) {
+     SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0));
+     SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL);
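+     // Little-endian i32 lanes: i64 << 32 turns {lo,hi} into {0,lo} and
+     // i64 >> 32 turns {lo,hi} into {hi,0}; mask index 8 selects an
+     // element of the zero vector.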
+     if (Op0.getOpcode() == X86ISD::VSHLI) {
+       Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                  {8, 0, 8, 2, 8, 4, 8, 6});
+     } else {
+       Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero,
+                                  {1, 8, 3, 8, 5, 8, 7, 8});
+     }
+     return DAG.getBitcast(VT, Res);
+   }
+   LLVM_FALLTHROUGH;
+ case X86ISD::VSRAI:
  if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
       (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
        (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
; AVX1-LABEL: uitofp_4i64_to_4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX1-NEXT: vorps {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_4i64_to_4f64:
;
; AVX1-LABEL: uitofp_load_4i64_to_4f64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; AVX1-NEXT: vmovaps (%rdi), %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT: vorps {{.*}}(%rip), %ymm2, %ymm2
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[2,0,3,1,6,4,7,5]
; AVX1-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vorpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vaddpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vsubpd {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: uitofp_load_4i64_to_4f64:
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsrlq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64:
define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: shift32_v4i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shift32_v4i64:
;
; XOPAVX1-LABEL: shift32_v4i64:
; XOPAVX1: # %bb.0:
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; XOPAVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; XOPAVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; XOPAVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; XOPAVX1-NEXT: retq
;
; XOPAVX2-LABEL: shift32_v4i64:
;
; X86-AVX1-LABEL: shift32_v4i64:
; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm1
-; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; X86-AVX1-NEXT: vpsllq $32, %xmm0, %xmm0
-; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
+; X86-AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
; X86-AVX1-NEXT: retl
;
; X86-AVX2-LABEL: shift32_v4i64: