From add9cc697a31fb39908c6193858c2dc5c054d218 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 18 Dec 2016 06:23:14 +0000
Subject: [PATCH] [AVX-512] Use EVEX encoded XOR instruction for zeroing scalar
 registers when DQI and VLX instructions are available.

This can give the register allocator more registers to use.

llvm-svn: 290057
---
 llvm/lib/Target/X86/X86InstrAVX512.td  | 10 ++++++++++
 llvm/lib/Target/X86/X86InstrInfo.cpp   | 13 ++++++++++---
 llvm/lib/Target/X86/X86InstrSSE.td     |  4 ++--
 llvm/test/CodeGen/X86/avx512-scalar.ll | 26 +++++++++++++++++++++++++-
 4 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 03ba06d..69554ea 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -451,6 +451,16 @@ def AVX512_256_SET0 : I<0, Pseudo, (outs VR256X:$dst), (ins), "",
                         [(set VR256X:$dst, (v8i32 immAllZerosV))]>;
 }
 
+// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
+// This is expanded by ExpandPostRAPseudos.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasVLX, HasDQI] in {
+  def AVX512_FsFLD0SS : I<0, Pseudo, (outs FR32X:$dst), (ins), "",
+                          [(set FR32X:$dst, fp32imm0)]>;
+  def AVX512_FsFLD0SD : I<0, Pseudo, (outs FR64X:$dst), (ins), "",
+                          [(set FR64X:$dst, fpimm0)]>;
+}
+
 //===----------------------------------------------------------------------===//
 // AVX-512 - VECTOR INSERT
 //
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index e87fdbe..8481417 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -6824,6 +6824,9 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return Expand2AddrUndef(MIB, get(X86::VPXORDZ256rr));
   case X86::AVX512_512_SET0:
     return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
+  case X86::AVX512_FsFLD0SS:
+  case X86::AVX512_FsFLD0SD:
+    return Expand2AddrUndef(MIB, get(X86::VXORPSZ128rr));
   case X86::V_SETALLONES:
     return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
@@ -7664,9 +7667,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     Alignment = 16;
     break;
   case X86::FsFLD0SD:
+  case X86::AVX512_FsFLD0SD:
     Alignment = 8;
     break;
   case X86::FsFLD0SS:
+  case X86::AVX512_FsFLD0SS:
     Alignment = 4;
     break;
   default:
@@ -7703,7 +7708,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   case X86::AVX512_512_SET0:
   case X86::AVX512_512_SETALLONES:
   case X86::FsFLD0SD:
-  case X86::FsFLD0SS: {
+  case X86::AVX512_FsFLD0SD:
+  case X86::FsFLD0SS:
+  case X86::AVX512_FsFLD0SS: {
     // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
     // Create a constant-pool entry and operands to load from it.
 
@@ -7729,9 +7736,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     MachineConstantPool &MCP = *MF.getConstantPool();
     Type *Ty;
     unsigned Opc = LoadMI.getOpcode();
-    if (Opc == X86::FsFLD0SS)
+    if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
       Ty = Type::getFloatTy(MF.getFunction()->getContext());
-    else if (Opc == X86::FsFLD0SD)
+    else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
       Ty = Type::getDoubleTy(MF.getFunction()->getContext());
     else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 734c8a8..0e82a1e 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -447,9 +447,9 @@ def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
 let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
     isPseudo = 1, SchedRW = [WriteZero] in {
   def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
-                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1]>;
+                   [(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoVLX_Or_NoDQI]>;
   def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
-                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2]>;
+                   [(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoVLX_Or_NoDQI]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/X86/avx512-scalar.ll b/llvm/test/CodeGen/X86/avx512-scalar.ll
index 644fda4..40442cf 100644
--- a/llvm/test/CodeGen/X86/avx512-scalar.ll
+++ b/llvm/test/CodeGen/X86/avx512-scalar.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s --check-prefix AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s --check-prefix AVX512 --check-prefix AVX512-KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s --check-prefix AVX512 --check-prefix AVX512-SKX
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx --show-mc-encoding | FileCheck %s --check-prefix AVX
 
 ; AVX512-LABEL: @test_fdiv
@@ -90,3 +91,26 @@ define float @test_mov(float %a, float %b, float %i, float %j) {
   ret float %max
 }
 
+; AVX512-SKX-LABEL: @zero_float
+; AVX512-SKX: vxorps %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0x62,
+; AVX512-KNL-LABEL: @zero_float
+; AVX512-KNL: vxorps %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0xc5,
+; AVX-LABEL: @zero_float
+; AVX: vxorps %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0xc5,
+
+define float @zero_float(float %a) {
+  %b = fadd float %a, 0.0
+  ret float %b
+}
+
+; AVX512-SKX-LABEL: @zero_double
+; AVX512-SKX: vxorpd %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0x62,
+; AVX512-KNL-LABEL: @zero_double
+; AVX512-KNL: vxorpd %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0xc5,
+; AVX-LABEL: @zero_double
+; AVX: vxorpd %xmm{{.*}}, %xmm{{.*}}, %xmm{{.*}} ## encoding: [0xc5,
+
+define double @zero_double(double %a) {
+  %b = fadd double %a, 0.0
+  ret double %b
+}
-- 
2.7.4
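
For context, an illustration that is not taken from the commit itself: the VEX-encoded vxorps (the 0xc5 prefix byte the KNL and AVX checks above match) can only name registers xmm0-xmm15, while the EVEX-encoded form (the 0x62 prefix byte the SKX checks match) can also name xmm16-xmm31 on targets with AVX512DQ and AVX512VL such as -mcpu=skx. A rough sketch of the two zero idioms in AT&T syntax:

    vxorps %xmm2, %xmm2, %xmm2       # VEX encoding, limited to xmm0-xmm15
    vxorps %xmm16, %xmm16, %xmm16    # EVEX encoding, can also reach xmm16-xmm31

Because the new AVX512_FsFLD0SS/AVX512_FsFLD0SD pseudos expand to the EVEX form (VXORPSZ128rr), the register allocator may place a zeroed scalar in any of the 32 XMM registers rather than only the low 16, which is the benefit the commit message describes.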