From f31b0b850bb03bf53c9fd60c929bcf7462554526 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 23 Nov 2017 18:41:21 +0000
Subject: [PATCH] [X86] Teach isel that X86ISD::CMPM_RND zeros the upper bits of the mask register.

llvm-svn: 318933
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp          |   3 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp          |   1 +
 llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll | 116 ++++++++++++++++++++++-
 3 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 504482a..93a3b928 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -456,7 +456,8 @@ static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
   unsigned Opcode = N->getOpcode();
   if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
       Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
-      Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU) {
+      Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU ||
+      Opcode == X86ISD::CMPM_RND) {
     // We can get 256-bit 8 element types here without VLX being enabled. When
     // this happens we will use 512-bit operations and the mask will not be
     // zero extended.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index eabaad8..c0d8504 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5013,6 +5013,7 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
   case X86ISD::PCMPGTM:
   case X86ISD::CMPM:
   case X86ISD::CMPMU:
+  case X86ISD::CMPM_RND:
     return true;
   }
 }
diff --git a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
index 26a7d83..673e442 100644
--- a/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-vec-masked-cmp.ll
@@ -58076,5 +58076,117 @@ entry:
   ret i64 %3
 }
 
-
-
+; Test that we understand that cmpps with rounding zeros the upper bits of the mask register.
+define i32 @test_cmpm_rnd_zero(<16 x float> %a, <16 x float> %b) {
+; VLX-LABEL: test_cmpm_rnd_zero:
+; VLX:       # BB#0:
+; VLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; VLX-NEXT:    kmovd %k0, %eax
+; VLX-NEXT:    vzeroupper
+; VLX-NEXT:    retq
+;
+; NoVLX-LABEL: test_cmpm_rnd_zero:
+; NoVLX:       # BB#0:
+; NoVLX-NEXT:    pushq %rbp
+; NoVLX-NEXT:    .cfi_def_cfa_offset 16
+; NoVLX-NEXT:    .cfi_offset %rbp, -16
+; NoVLX-NEXT:    movq %rsp, %rbp
+; NoVLX-NEXT:    .cfi_def_cfa_register %rbp
+; NoVLX-NEXT:    pushq %r15
+; NoVLX-NEXT:    pushq %r14
+; NoVLX-NEXT:    pushq %r13
+; NoVLX-NEXT:    pushq %r12
+; NoVLX-NEXT:    pushq %rbx
+; NoVLX-NEXT:    andq $-32, %rsp
+; NoVLX-NEXT:    subq $32, %rsp
+; NoVLX-NEXT:    .cfi_offset %rbx, -56
+; NoVLX-NEXT:    .cfi_offset %r12, -48
+; NoVLX-NEXT:    .cfi_offset %r13, -40
+; NoVLX-NEXT:    .cfi_offset %r14, -32
+; NoVLX-NEXT:    .cfi_offset %r15, -24
+; NoVLX-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
+; NoVLX-NEXT:    kxorw %k0, %k0, %k1
+; NoVLX-NEXT:    kmovw %k1, {{[0-9]+}}(%rsp)
+; NoVLX-NEXT:    kshiftlw $14, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r8d
+; NoVLX-NEXT:    kshiftlw $15, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $13, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r9d
+; NoVLX-NEXT:    kshiftlw $12, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r11d
+; NoVLX-NEXT:    kshiftlw $11, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r14d
+; NoVLX-NEXT:    kshiftlw $10, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r15d
+; NoVLX-NEXT:    kshiftlw $9, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r12d
+; NoVLX-NEXT:    kshiftlw $8, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %r13d
+; NoVLX-NEXT:    kshiftlw $7, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %esi
+; NoVLX-NEXT:    kshiftlw $6, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %ebx
+; NoVLX-NEXT:    kshiftlw $5, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edi
+; NoVLX-NEXT:    kshiftlw $4, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %eax
+; NoVLX-NEXT:    kshiftlw $3, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    kmovw %k1, %edx
+; NoVLX-NEXT:    kshiftlw $2, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vmovd %r10d, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %r10d
+; NoVLX-NEXT:    kshiftlw $1, %k0, %k1
+; NoVLX-NEXT:    kshiftrw $15, %k1, %k1
+; NoVLX-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k1, %ecx
+; NoVLX-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $8, %esi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $9, %ebx, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $10, %edi, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
+; NoVLX-NEXT:    kshiftrw $15, %k0, %k0
+; NoVLX-NEXT:    vpinsrb $13, %r10d, %xmm0, %xmm0
+; NoVLX-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; NoVLX-NEXT:    kmovw %k0, %eax
+; NoVLX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; NoVLX-NEXT:    vpmovsxbd %xmm0, %zmm0
+; NoVLX-NEXT:    vpslld $31, %zmm0, %zmm0
+; NoVLX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; NoVLX-NEXT:    kmovw %k0, (%rsp)
+; NoVLX-NEXT:    movl (%rsp), %eax
+; NoVLX-NEXT:    leaq -40(%rbp), %rsp
+; NoVLX-NEXT:    popq %rbx
+; NoVLX-NEXT:    popq %r12
+; NoVLX-NEXT:    popq %r13
+; NoVLX-NEXT:    popq %r14
+; NoVLX-NEXT:    popq %r15
+; NoVLX-NEXT:    popq %rbp
+; NoVLX-NEXT:    vzeroupper
+; NoVLX-NEXT:    retq
+  %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
+  %cast = bitcast i16 %res to <16 x i1>
+  %shuffle = shufflevector <16 x i1> %cast, <16 x i1> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %cast2 = bitcast <32 x i1> %shuffle to i32
+  ret i32 %cast2
+}
-- 
2.7.4
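
For context, the source-level pattern that test_cmpm_rnd_zero models looks roughly like the sketch below. It is not part of the patch: the helper name is made up, and it assumes an AVX-512F target with <immintrin.h> available. The {sae} compare intrinsic produces a 16-bit mask in a k-register, and widening that mask to 32 bits should need no extra zero-extension once isel knows X86ISD::CMPM_RND already zeroes the upper bits of the mask register.

  #include <immintrin.h>
  #include <stdint.h>

  // Hypothetical helper, not from the patch. Compile with e.g. -O2 -mavx512f.
  // The SAE (suppress-all-exceptions) compare yields a 16-bit k-register mask;
  // with this change the cast to uint32_t should lower to a plain kmovw/kmovd
  // of %k0 rather than an extra masking/zero-extend of the upper 16 bits.
  uint32_t cmp_le_sae_mask32(__m512 a, __m512 b) {
    __mmask16 m = _mm512_cmp_round_ps_mask(a, b, _CMP_LE_OS, _MM_FROUND_NO_EXC);
    return (uint32_t)m;
  }

This mirrors the IR in the new test: the i16 result of llvm.x86.avx512.mask.cmp.ps.512 (predicate 2 = LE_OS, rounding 8 = NO_EXC) is widened to i32 by concatenating with zeros, and the VLX check lines expect only vcmpleps {sae} followed by kmovd.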