From c4f2b0996d825cecd3b6150c51a4da1e0bb5bbf0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 9 Dec 2016 05:20:11 +0000 Subject: [PATCH] [X86] Add masked versions of VPERMT2* and VPERMI2* to load folding tables. llvm-svn: 289186 --- llvm/lib/Target/X86/X86InstrInfo.cpp | 90 ++++++++++++++++++++++-- llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll | 28 ++++++++ 2 files changed, 112 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 7f88f8a..1679f99 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2675,11 +2675,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, + { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, + { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, + { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, + { X86::VPERMI2PDrrk, X86::VPERMI2PDrmk, 0 }, + { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, + { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, + { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, + { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, + { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, + { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, + { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, + { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, @@ -2694,9 +2706,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, - { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, - { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, @@ -2755,11 +2765,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, + { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, + { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, + { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, + { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, + { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, + { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, { X86::VPERMPSZ256rrk, X86::VPERMPSZ256rmk, 0 }, { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, + { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, + { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, + { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, + { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, + { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, + { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, @@ -2775,9 +2797,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, - { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, - { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, @@ -2831,8 +2851,20 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, + { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, + { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, + { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, + { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, + { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, + { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, + { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, + { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, + { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, + { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, + { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, + { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, @@ -2848,9 +2880,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, - { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, - { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, @@ -2869,6 +2899,54 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, + + // 512-bit three source instructions with zero masking. + { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, + { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, + { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, + { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, + { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, + { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, + { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, + { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, + { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, + { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, + { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, + { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, + { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, + { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, + + // 256-bit three source instructions with zero masking. + { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, + { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, + { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, + { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, + { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, + { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, + { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, + { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, + { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, + { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, + { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, + { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, + { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, + { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, + + // 128-bit three source instructions with zero masking. + { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, + { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, + { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, + { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, + { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, + { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, + { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, + { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, + { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, + { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, + { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, + { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, + { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, + { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll index 4a1ad4b..4d5e8c9 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512.ll @@ -534,6 +534,34 @@ define <16 x float> @stack_fold_vpermi2ps(<16 x i32> %x0, <16 x float> %x1, <16 } declare <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) +define <16 x float> @stack_fold_vpermi2ps_mask(<16 x float> %x0, <16 x i32>* %x1, <16 x float> %x2, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpermi2ps_mask + ;CHECK: vpermi2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %x1b = load <16 x i32>, <16 x i32>* %x1 + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1b, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @stack_fold_vpermt2ps_mask(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpermt2ps_mask + ;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %x0b = load <16 x i32>, <16 x i32>* %x0 + %res = call <16 x float> @llvm.x86.avx512.mask.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @stack_fold_vpermt2ps_maskz(<16 x i32>* %x0, <16 x float> %x1, <16 x float> %x2, i16 %mask) { + ;CHECK-LABEL: stack_fold_vpermt2ps_maskz + ;CHECK: vpermt2ps {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %x0b = load <16 x i32>, <16 x i32>* %x0 + %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0b, <16 x float> %x1, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) + define <8 x double> @stack_fold_vpermt2pd(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { ;CHECK-LABEL: stack_fold_vpermt2pd ;CHECK: vpermt2pd {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload -- 2.7.4