ret <16 x i8> %res
}
+; CGP should sink the splat shift amount across blocks so that it can be seen in SDAG.
+; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
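+; SDAG only sees one basic block at a time, so unless CGP sinks (duplicates) the
+; splat next to its use in %loop, instruction selection cannot tell that the
+; rotate amount is uniform across lanes.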
+
+define void @sink_splatvar(i32* %p, i32 %shift_amt) {
+; SSE2-LABEL: sink_splatvar:
+; SSE2: # %bb.0: # %entry
+; SSE2-NEXT: movd %esi, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: pslld $23, %xmm0
+; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE2-NEXT: .p2align 4, 0x90
+; SSE2-NEXT: .LBB8_1: # %loop
+; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: por %xmm4, %xmm2
+; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
+; SSE2-NEXT: addq $16, %rax
+; SSE2-NEXT: jne .LBB8_1
+; SSE2-NEXT: # %bb.2: # %end
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sink_splatvar:
+; SSE41: # %bb.0: # %entry
+; SSE41-NEXT: movd %esi, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00
+; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT: pslld $23, %xmm0
+; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE41-NEXT: .p2align 4, 0x90
+; SSE41-NEXT: .LBB8_1: # %loop
+; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
+; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; SSE41-NEXT: pmuludq %xmm1, %xmm3
+; SSE41-NEXT: pmuludq %xmm0, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; SSE41-NEXT: por %xmm4, %xmm3
+; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax)
+; SSE41-NEXT: addq $16, %rax
+; SSE41-NEXT: jne .LBB8_1
+; SSE41-NEXT: # %bb.2: # %end
+; SSE41-NEXT: retq
+;
+; AVX1-LABEL: sink_splatvar:
+; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vmovd %esi, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vpslld $23, %xmm0, %xmm0
+; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT: .p2align 4, 0x90
+; AVX1-NEXT: .LBB8_1: # %loop
+; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
+; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
+; AVX1-NEXT: addq $16, %rax
+; AVX1-NEXT: jne .LBB8_1
+; AVX1-NEXT: # %bb.2: # %end
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: sink_splatvar:
+; AVX2: # %bb.0: # %entry
+; AVX2-NEXT: vmovd %esi, %xmm0
+; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
+; AVX2-NEXT: vpsubd %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: .p2align 4, 0x90
+; AVX2-NEXT: .LBB8_1: # %loop
+; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
+; AVX2-NEXT: vpsllvd %xmm0, %xmm2, %xmm3
+; AVX2-NEXT: vpsrlvd %xmm1, %xmm2, %xmm2
+; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
+; AVX2-NEXT: addq $16, %rax
+; AVX2-NEXT: jne .LBB8_1
+; AVX2-NEXT: # %bb.2: # %end
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: sink_splatvar:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vmovd %esi, %xmm0
+; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512F-NEXT: .p2align 4, 0x90
+; AVX512F-NEXT: .LBB8_1: # %loop
+; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512F-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512F-NEXT: vprolvd %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512F-NEXT: addq $16, %rax
+; AVX512F-NEXT: jne .LBB8_1
+; AVX512F-NEXT: # %bb.2: # %end
+; AVX512F-NEXT: vzeroupper
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: sink_splatvar:
+; AVX512VL: # %bb.0: # %entry
+; AVX512VL-NEXT: vpbroadcastd %esi, %xmm0
+; AVX512VL-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512VL-NEXT: .p2align 4, 0x90
+; AVX512VL-NEXT: .LBB8_1: # %loop
+; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512VL-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512VL-NEXT: vprolvd %xmm0, %xmm1, %xmm1
+; AVX512VL-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512VL-NEXT: addq $16, %rax
+; AVX512VL-NEXT: jne .LBB8_1
+; AVX512VL-NEXT: # %bb.2: # %end
+; AVX512VL-NEXT: retq
+;
+; AVX512BW-LABEL: sink_splatvar:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vmovd %esi, %xmm0
+; AVX512BW-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512BW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512BW-NEXT: .p2align 4, 0x90
+; AVX512BW-NEXT: .LBB8_1: # %loop
+; AVX512BW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512BW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512BW-NEXT: vprolvd %zmm0, %zmm1, %zmm1
+; AVX512BW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512BW-NEXT: addq $16, %rax
+; AVX512BW-NEXT: jne .LBB8_1
+; AVX512BW-NEXT: # %bb.2: # %end
+; AVX512BW-NEXT: vzeroupper
+; AVX512BW-NEXT: retq
+;
+; AVX512VBMI2-LABEL: sink_splatvar:
+; AVX512VBMI2: # %bb.0: # %entry
+; AVX512VBMI2-NEXT: vmovd %esi, %xmm0
+; AVX512VBMI2-NEXT: vpbroadcastd %xmm0, %xmm0
+; AVX512VBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512VBMI2-NEXT: .p2align 4, 0x90
+; AVX512VBMI2-NEXT: .LBB8_1: # %loop
+; AVX512VBMI2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512VBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512VBMI2-NEXT: vprolvd %zmm0, %zmm1, %zmm1
+; AVX512VBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512VBMI2-NEXT: addq $16, %rax
+; AVX512VBMI2-NEXT: jne .LBB8_1
+; AVX512VBMI2-NEXT: # %bb.2: # %end
+; AVX512VBMI2-NEXT: vzeroupper
+; AVX512VBMI2-NEXT: retq
+;
+; AVX512VLBW-LABEL: sink_splatvar:
+; AVX512VLBW: # %bb.0: # %entry
+; AVX512VLBW-NEXT: vpbroadcastd %esi, %xmm0
+; AVX512VLBW-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512VLBW-NEXT: .p2align 4, 0x90
+; AVX512VLBW-NEXT: .LBB8_1: # %loop
+; AVX512VLBW-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512VLBW-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512VLBW-NEXT: vprolvd %xmm0, %xmm1, %xmm1
+; AVX512VLBW-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512VLBW-NEXT: addq $16, %rax
+; AVX512VLBW-NEXT: jne .LBB8_1
+; AVX512VLBW-NEXT: # %bb.2: # %end
+; AVX512VLBW-NEXT: retq
+;
+; AVX512VLVBMI2-LABEL: sink_splatvar:
+; AVX512VLVBMI2: # %bb.0: # %entry
+; AVX512VLVBMI2-NEXT: vpbroadcastd %esi, %xmm0
+; AVX512VLVBMI2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; AVX512VLVBMI2-NEXT: .p2align 4, 0x90
+; AVX512VLVBMI2-NEXT: .LBB8_1: # %loop
+; AVX512VLVBMI2-NEXT: # =>This Inner Loop Header: Depth=1
+; AVX512VLVBMI2-NEXT: vmovdqu 1024(%rdi,%rax), %xmm1
+; AVX512VLVBMI2-NEXT: vprolvd %xmm0, %xmm1, %xmm1
+; AVX512VLVBMI2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; AVX512VLVBMI2-NEXT: addq $16, %rax
+; AVX512VLVBMI2-NEXT: jne .LBB8_1
+; AVX512VLVBMI2-NEXT: # %bb.2: # %end
+; AVX512VLVBMI2-NEXT: retq
+;
+; XOPAVX1-LABEL: sink_splatvar:
+; XOPAVX1: # %bb.0: # %entry
+; XOPAVX1-NEXT: vmovd %esi, %xmm0
+; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; XOPAVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
+; XOPAVX1-NEXT: .p2align 4, 0x90
+; XOPAVX1-NEXT: .LBB8_1: # %loop
+; XOPAVX1-NEXT: # =>This Inner Loop Header: Depth=1
+; XOPAVX1-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
+; XOPAVX1-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; XOPAVX1-NEXT: addq $16, %rax
+; XOPAVX1-NEXT: jne .LBB8_1
+; XOPAVX1-NEXT: # %bb.2: # %end
+; XOPAVX1-NEXT: retq
+;
+; XOPAVX2-LABEL: sink_splatvar:
+; XOPAVX2: # %bb.0: # %entry
+; XOPAVX2-NEXT: vmovd %esi, %xmm0
+; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0
+; XOPAVX2-NEXT: movq $-1024, %rax # imm = 0xFC00
+; XOPAVX2-NEXT: .p2align 4, 0x90
+; XOPAVX2-NEXT: .LBB8_1: # %loop
+; XOPAVX2-NEXT: # =>This Inner Loop Header: Depth=1
+; XOPAVX2-NEXT: vprotd %xmm0, 1024(%rdi,%rax), %xmm1
+; XOPAVX2-NEXT: vmovdqu %xmm1, 1024(%rdi,%rax)
+; XOPAVX2-NEXT: addq $16, %rax
+; XOPAVX2-NEXT: jne .LBB8_1
+; XOPAVX2-NEXT: # %bb.2: # %end
+; XOPAVX2-NEXT: retq
+;
+; X32-SSE-LABEL: sink_splatvar:
+; X32-SSE: # %bb.0: # %entry
+; X32-SSE-NEXT: pushl %esi
+; X32-SSE-NEXT: .cfi_def_cfa_offset 8
+; X32-SSE-NEXT: .cfi_offset %esi, -8
+; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-SSE-NEXT: xorl %ecx, %ecx
+; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: pslld $23, %xmm0
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; X32-SSE-NEXT: xorl %edx, %edx
+; X32-SSE-NEXT: .p2align 4, 0x90
+; X32-SSE-NEXT: .LBB8_1: # %loop
+; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1
+; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
+; X32-SSE-NEXT: pmuludq %xmm0, %xmm2
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; X32-SSE-NEXT: pmuludq %xmm1, %xmm3
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
+; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-SSE-NEXT: por %xmm4, %xmm2
+; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)
+; X32-SSE-NEXT: addl $4, %ecx
+; X32-SSE-NEXT: adcl $0, %edx
+; X32-SSE-NEXT: movl %ecx, %esi
+; X32-SSE-NEXT: xorl $256, %esi # imm = 0x100
+; X32-SSE-NEXT: orl %edx, %esi
+; X32-SSE-NEXT: jne .LBB8_1
+; X32-SSE-NEXT: # %bb.2: # %end
+; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: .cfi_def_cfa_offset 4
+; X32-SSE-NEXT: retl
+entry:
+ %ins = insertelement <4 x i32> undef, i32 %shift_amt, i32 0
+ %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
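+  ; The splat is created here in %entry, but its only use is the fshl call in
+  ; %loop below - the cross-block pattern that CGP is expected to sink.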
+ br label %loop
+
+loop:
+ %index = phi i64 [ 0, %entry ], [ %inc, %loop ]
+ %addr = getelementptr inbounds i32, i32* %p, i64 %index
+ %addr_vec = bitcast i32* %addr to <4 x i32>*
+ %x = load <4 x i32>, <4 x i32>* %addr_vec, align 4
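+  ; fshl with both value operands equal to %x is a rotate-left of %x by %splat.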
+ %fsh = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
+ store <4 x i32> %fsh, <4 x i32>* %addr_vec, align 4
+ %inc = add i64 %index, 4
+ %iv = icmp eq i64 %inc, 256
+ br i1 %iv, label %end, label %loop
+
+end:
+ ret void
+}
+
;
; Constant Shifts
;