From 4fab487265a1014485acab935c75f6e584282ffc Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 27 Nov 2016 19:51:41 +0000
Subject: [PATCH] [AVX-512] Add integer and fp unpck instructions to load folding tables.

llvm-svn: 288004
---
 llvm/lib/Target/X86/X86InstrInfo.cpp               | 108 +++++++++++++++++++++
 llvm/test/CodeGen/X86/stack-folding-int-avx512.ll  |  30 ++++++
 .../test/CodeGen/X86/stack-folding-int-avx512vl.ll |  60 ++++++++++++
 llvm/test/CodeGen/X86/vec_fp_to_int.ll             |   8 +-
 4 files changed, 202 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index aaffca0..e9d544a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1874,6 +1874,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 },
     { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 },
     { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 },
+    { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 },
+    { X86::VPUNPCKHDQZrr, X86::VPUNPCKHDQZrm, 0 },
+    { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 },
+    { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 },
+    { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 },
+    { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 },
+    { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 },
+    { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 },
     { X86::VPXORDZrr, X86::VPXORDZrm, 0 },
     { X86::VPXORQZrr, X86::VPXORQZrm, 0 },
     { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
@@ -1884,6 +1892,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE },
     { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 },
     { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE },
+    { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 },
+    { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 },
+    { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 },
+    { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 },
     { X86::VXORPDZrr, X86::VXORPDZrm, 0 },
     { X86::VXORPSZrr, X86::VXORPSZrm, 0 },
 
@@ -2015,6 +2027,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 },
     { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 },
     { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 },
+    { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 },
+    { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 },
+    { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 },
+    { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 },
+    { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 },
+    { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 },
+    { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 },
+    { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 },
+    { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 },
+    { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 },
+    { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 },
+    { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 },
+    { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 },
+    { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 },
+    { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 },
+    { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 },
     { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 },
     { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 },
     { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 },
@@ -2023,6 +2051,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 },
     { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 },
     { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 },
+    { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 },
+    { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 },
+    { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 },
+    { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 },
+    { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 },
+    { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 },
+    { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 },
+    { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 },
     { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 },
     { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 },
     { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 },
@@ -2241,10 +2277,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 },
     { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 },
     { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 },
+    { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 },
+    { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 },
+    { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 },
+    { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 },
+    { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 },
+    { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 },
+    { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 },
+    { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 },
     { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 },
     { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 },
     { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 },
     { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 },
+    { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 },
+    { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 },
+    { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 },
+    { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 },
     { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 },
     { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 },
 
@@ -2291,10 +2339,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 },
     { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 },
     { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 },
+    { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 },
+    { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 },
+    { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 },
+    { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 },
+    { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 },
+    { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 },
+    { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 },
+    { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 },
     { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 },
     { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 },
     { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 },
     { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 },
+    { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 },
+    { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 },
+    { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 },
+    { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 },
     { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 },
     { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 },
 
@@ -2341,10 +2401,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 },
     { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 },
     { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 },
+    { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 },
+    { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 },
+    { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 },
+    { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 },
+    { X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 },
+    { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 },
+    { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 },
+    { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 },
     { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 },
     { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 },
     { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 },
     { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 },
+    { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 },
+    { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 },
+    { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 },
+    { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 },
     { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 },
     { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 },
 
@@ -2461,10 +2533,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
     { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 },
     { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
+    { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 },
+    { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 },
+    { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 },
+    { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 },
+    { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 },
+    { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 },
+    { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 },
+    { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 },
     { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 },
     { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 },
     { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 },
     { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 },
+    { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 },
+    { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 },
+    { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 },
+    { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 },
     { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 },
     { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 },
 
@@ -2515,10 +2599,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
     { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 },
     { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
+    { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 },
+    { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 },
+    { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 },
+    { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 },
+    { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 },
+    { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 },
+    { X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 },
+    { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 },
     { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 },
     { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 },
     { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 },
     { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 },
+    { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 },
+    { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 },
+    { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 },
+    { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 },
     { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 },
     { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 },
 
@@ -2569,10 +2665,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
     { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 },
     { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
+    { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 },
+    { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 },
+    { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 },
+    { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 },
+    { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 },
+    { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 },
+    { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 },
+    { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 },
     { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 },
     { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 },
     { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 },
     { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 },
+    { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 },
+    { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 },
+    { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 },
+    { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 },
     { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 },
     { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 },
   };
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
index 600bfe4..577b1c0 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -671,3 +671,33 @@ define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
   %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
   ret <8 x i64> %4
 }
+
+define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_zmm
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  ret <64 x i8> %2
+}
+
+define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask_zmm
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  %3 = bitcast i64 %mask to <64 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <64 x i8>, <64 x i8>* %passthru
+  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
+  ret <64 x i8> %5
+}
+
+define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
+  %3 = bitcast i64 %mask to <64 x i1>
+  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
+  ret <64 x i8> %4
+}
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 6446ee1..d11e744 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -932,3 +932,63 @@ define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a
   %6 = select <4 x i1> %5, <4 x i64> %3, <4 x i64> %passthru
   ret <4 x i64> %6
 }
+
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x i8>, <16 x i8>* %passthru
+  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
+  ret <16 x i8> %5
+}
+
+define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_maskz
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
+  ret <16 x i8> %4
+}
+
+define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_ymm
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
+  ret <32 x i8> %2
+}
+
+define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask_ymm
+  ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
"=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> + %3 = bitcast i32 %mask to <32 x i1> + ; load needed to keep the operation from being scheduled about the asm block + %4 = load <32 x i8>, <32 x i8>* %passthru + %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4 + ret <32 x i8> %5 +} + +define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) { + ;CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm + ;CHECK: vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload + %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() + %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> + %3 = bitcast i32 %mask to <32 x i1> + %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer + ret <32 x i8> %4 +} diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 11cc4ce..4641e4a 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -2423,8 +2423,8 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; AVX512VL-NEXT: movq %r14, %rsi ; AVX512VL-NEXT: callq __fixtfdi ; AVX512VL-NEXT: vmovq %rax, %xmm0 -; AVX512VL-NEXT: vmovdqa64 (%rsp), %xmm1 # 16-byte Reload -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VL-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VL-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VL-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VL-NEXT: addq $24, %rsp @@ -2473,8 +2473,8 @@ define <4 x i32> @fptosi_2f128_to_4i32(<2 x fp128> %a) nounwind { ; AVX512VLDQ-NEXT: movq %r14, %rsi ; AVX512VLDQ-NEXT: callq __fixtfdi ; AVX512VLDQ-NEXT: vmovq %rax, %xmm0 -; AVX512VLDQ-NEXT: vmovdqa64 (%rsp), %xmm1 # 16-byte Reload -; AVX512VLDQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX512VLDQ-NEXT: vpunpcklqdq (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX512VLDQ-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX512VLDQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; AVX512VLDQ-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX512VLDQ-NEXT: addq $24, %rsp -- 2.7.4