From 0e533ca4bb82fee0b1c25853129a2f5f80dbf97b Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 10 Sep 2019 05:49:53 +0000
Subject: [PATCH] [X86] Add broadcast load unfolding support for VCMPPS/PD.

llvm-svn: 371487
---
 llvm/lib/Target/X86/X86InstrFoldTables.cpp       |  6 +++
 llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll | 55 +++++++++++++-----------
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 07bbf4c..0f89a80 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -5252,6 +5252,12 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable2[] = {
   { X86::VADDPSZ128rr, X86::VADDPSZ128rmb, TB_BCAST_SS },
   { X86::VADDPSZ256rr, X86::VADDPSZ256rmb, TB_BCAST_SS },
   { X86::VADDPSZrr, X86::VADDPSZrmb, TB_BCAST_SS },
+  { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmbi, TB_BCAST_SD },
+  { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmbi, TB_BCAST_SD },
+  { X86::VCMPPDZrri, X86::VCMPPDZrmbi, TB_BCAST_SD },
+  { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmbi, TB_BCAST_SS },
+  { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmbi, TB_BCAST_SS },
+  { X86::VCMPPSZrri, X86::VCMPPSZrmbi, TB_BCAST_SS },
   { X86::VDIVPDZ128rr, X86::VDIVPDZ128rmb, TB_BCAST_SD },
   { X86::VDIVPDZ256rr, X86::VDIVPDZ256rmb, TB_BCAST_SD },
   { X86::VDIVPDZrr, X86::VDIVPDZrmb, TB_BCAST_SD },
diff --git a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
index a798102..fb097dd 100644
--- a/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
+++ b/llvm/test/CodeGen/X86/avx512-broadcast-unfold.ll
@@ -4227,14 +4227,15 @@ define void @bcast_unfold_cmp_v4f32(float* %arg) {
 ; CHECK-LABEL: bcast_unfold_cmp_v4f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} xmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB120_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm1
-; CHECK-NEXT:    vcmpltps {{.*}}(%rip){1to4}, %xmm1, %k1
-; CHECK-NEXT:    vblendmps %xmm1, %xmm0, %xmm1 {%k1}
-; CHECK-NEXT:    vmovups %xmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %xmm2
+; CHECK-NEXT:    vcmpltps %xmm0, %xmm2, %k1
+; CHECK-NEXT:    vblendmps %xmm2, %xmm1, %xmm2 {%k1}
+; CHECK-NEXT:    vmovups %xmm2, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $16, %rax
 ; CHECK-NEXT:    jne .LBB120_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -4263,14 +4264,15 @@ define void @bcast_unfold_cmp_v8f32(float* %arg) {
 ; CHECK-LABEL: bcast_unfold_cmp_v8f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB121_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm1
-; CHECK-NEXT:    vcmpltps {{.*}}(%rip){1to8}, %ymm1, %k1
-; CHECK-NEXT:    vblendmps %ymm1, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT:    vmovups %ymm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %ymm2
+; CHECK-NEXT:    vcmpltps %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmps %ymm2, %ymm1, %ymm2 {%k1}
+; CHECK-NEXT:    vmovups %ymm2, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB121_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -4300,14 +4302,15 @@ define void @bcast_unfold_cmp_v16f32(float* %arg) {
 ; CHECK-LABEL: bcast_unfold_cmp_v16f32:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-4096, %rax # imm = 0xF000
-; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vbroadcastss {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB122_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm1
-; CHECK-NEXT:    vcmpltps {{.*}}(%rip){1to16}, %zmm1, %k1
-; CHECK-NEXT:    vblendmps %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovups %zmm1, 4096(%rdi,%rax)
+; CHECK-NEXT:    vmovups 4096(%rdi,%rax), %zmm2
+; CHECK-NEXT:    vcmpltps %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vblendmps %zmm2, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovups %zmm2, 4096(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB122_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -4374,14 +4377,15 @@ define void @bcast_unfold_cmp_v4f64(double* %arg) {
 ; CHECK-LABEL: bcast_unfold_cmp_v4f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB124_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm1
-; CHECK-NEXT:    vcmpltpd {{.*}}(%rip){1to4}, %ymm1, %k1
-; CHECK-NEXT:    vblendmpd %ymm1, %ymm0, %ymm1 {%k1}
-; CHECK-NEXT:    vmovupd %ymm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %ymm2
+; CHECK-NEXT:    vcmpltpd %ymm0, %ymm2, %k1
+; CHECK-NEXT:    vblendmpd %ymm2, %ymm1, %ymm2 {%k1}
+; CHECK-NEXT:    vmovupd %ymm2, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $32, %rax
 ; CHECK-NEXT:    jne .LBB124_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
@@ -4411,14 +4415,15 @@ define void @bcast_unfold_cmp_v8f64(double* %arg) {
 ; CHECK-LABEL: bcast_unfold_cmp_v8f64:
 ; CHECK:       # %bb.0: # %bb
 ; CHECK-NEXT:    movq $-8192, %rax # imm = 0xE000
-; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm0 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; CHECK-NEXT:    vbroadcastsd {{.*#+}} zmm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB125_1: # %bb1
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm1
-; CHECK-NEXT:    vcmpltpd {{.*}}(%rip){1to8}, %zmm1, %k1
-; CHECK-NEXT:    vblendmpd %zmm1, %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovupd %zmm1, 8192(%rdi,%rax)
+; CHECK-NEXT:    vmovupd 8192(%rdi,%rax), %zmm2
+; CHECK-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
+; CHECK-NEXT:    vblendmpd %zmm2, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovupd %zmm2, 8192(%rdi,%rax)
 ; CHECK-NEXT:    addq $64, %rax
 ; CHECK-NEXT:    jne .LBB125_1
 ; CHECK-NEXT:  # %bb.2: # %bb10
-- 
2.7.4
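
Background note (after the signature, so not part of the applied patch):
BroadcastFoldTable2 pairs the register form of a two-source AVX-512
instruction with its broadcast-from-memory form, and the TB_BCAST_SS /
TB_BCAST_SD flags record the element width of the embedded {1toN} operand.
With the new VCMPPS/VCMPPD entries, a compare whose source was folded into
a broadcast memory operand can be unfolded back into a standalone broadcast
load plus a register-register compare, which is what lets the loop-invariant
load be hoisted in the tests above. A minimal sketch of the shape of an
entry, with illustrative field names rather than the exact in-tree ones:

  // Illustrative-only model of a broadcast fold-table entry; the real
  // X86MemoryFoldTableEntry is defined in X86InstrFoldTables.cpp and its
  // field names may differ.
  struct FoldTableEntrySketch {
    unsigned RegOpc; // register form, e.g. X86::VCMPPSZrri
    unsigned MemOpc; // broadcast-from-memory form, e.g. X86::VCMPPSZrmbi
    unsigned Flags;  // TB_BCAST_SS or TB_BCAST_SD: element width of the
                     // {1toN} operand, telling the unfolder whether the
                     // recreated load is a VBROADCASTSS or a VBROADCASTSD
  };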
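The test updates show the payoff: before this change, every loop iteration
re-read the 2.0E+0 constant through a {1toN} broadcast memory operand folded
into vcmpltps/vcmpltpd; afterwards the constant is broadcast into a register
once, ahead of the loop. As a rough model, inferred only from the CHECK
lines (the .ll source is not shown in this diff), the v4f32 test behaves
like the following C++, which keeps values below 2.0 and replaces the rest
with 3.0:

  // Inferred from the generated assembly: 4096 bytes of floats gives 1024
  // elements; vcmpltps computes x < 2.0 into a mask, and vblendmps selects
  // x where the mask is set and 3.0 elsewhere.
  void bcast_unfold_cmp_v4f32(float *arg) {
    for (int i = 0; i < 1024; ++i)
      arg[i] = arg[i] < 2.0f ? arg[i] : 3.0f;
  }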