From 0e3f659137189abac6f732b6a576d5c0e2db8383 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 29 Sep 2019 18:43:08 +0000
Subject: [PATCH] [X86] Add custom isel logic to match VPTERNLOG from 2 logic ops.

There's room for improvement here, but this is a decent starting point.

There are a few minor regressions in the vector-rotate tests, where we are
now forming a vpternlog from an AND before we get a chance to form it for a
bitselect that we were matching previously. This results in an AND and an
ANDN feeding the vpternlog where previously we just had an AND after the
vpternlog. I think we can probably DAG combine the AND with the bitselect to
get back to similar codegen.

llvm-svn: 373172
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 80 ++++++++++-
 llvm/test/CodeGen/X86/avx512-cvt.ll | 17 ++-
 llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll | 144 ++++++++++----
 llvm/test/CodeGen/X86/machine-combiner-int-vec.ll | 117 ++++++++++------
 llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 12 +-
 llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 24 ++--
 llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 84 ++++++------
 llvm/test/CodeGen/X86/sadd_sat_vec.ll | 57 ++++++--
 llvm/test/CodeGen/X86/ssub_sat_vec.ll | 57 ++++++--
 llvm/test/CodeGen/X86/vec_int_to_fp.ll | 16 +--
 llvm/test/CodeGen/X86/vector-bitreverse.ll | 43 +++---
 llvm/test/CodeGen/X86/vector-fshl-256.ll | 9 +-
 llvm/test/CodeGen/X86/vector-fshl-512.ll | 56 ++++----
 llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 20 ++-
 llvm/test/CodeGen/X86/vector-fshl-rot-512.ll | 42 +++---
 llvm/test/CodeGen/X86/vector-fshr-256.ll | 29 ++--
 llvm/test/CodeGen/X86/vector-fshr-512.ll | 154 +++++++++++-----
 llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 20 ++-
 llvm/test/CodeGen/X86/vector-fshr-rot-512.ll | 82 ++++++------
 llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll | 6 +-
 llvm/test/CodeGen/X86/vector-rotate-128.ll | 77 ++++++++---
 llvm/test/CodeGen/X86/vector-rotate-256.ll | 69 ++++++----
 llvm/test/CodeGen/X86/vector-rotate-512.ll | 92 ++++++-------
 llvm/test/CodeGen/X86/vector-shift-ashr-128.ll | 3 +-
 llvm/test/CodeGen/X86/vector-shift-ashr-256.ll | 18 ++-
 llvm/test/CodeGen/X86/vector-shift-ashr-512.ll | 18 ++-
 llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll | 9 +-
 27 files changed, 766 insertions(+), 589 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index cb1c7d3..7cdb1db 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -509,6 +509,7 @@ namespace {
     bool tryShiftAmountMod(SDNode *N);
     bool combineIncDecVector(SDNode *Node);
     bool tryShrinkShlLogicImm(SDNode *N);
+    bool tryVPTERNLOG(SDNode *N);
     bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
     bool tryMatchBitSelect(SDNode *N);
@@ -3813,6 +3814,82 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
   return true;
 }

+// Try to match two logic ops to a VPTERNLOG.
+// FIXME: Handle inverted inputs?
+// FIXME: Handle more complex patterns that use an operand more than once?
+// FIXME: Support X86ISD::ANDNP
+bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
+  MVT NVT = N->getSimpleValueType(0);
+
+  // Make sure we support VPTERNLOG.
+  if (!NVT.isVector() || !Subtarget->hasAVX512() ||
+      NVT.getVectorElementType() == MVT::i1)
+    return false;
+
+  // We need VLX for 128/256-bit.
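For reference, the immediates chosen by this function follow directly from VPTERNLOG's truth-table encoding: the first, second, and third sources contribute the constant bit patterns 0xF0, 0xCC, and 0xAA, and the immediate is simply the matched expression A op1 (B op2 C) evaluated over those bytes. A minimal standalone C++ sketch (an illustration only, not part of the patch) that reproduces the table selected below:

#include <cstdint>
#include <cstdio>

int main() {
  // Truth-table patterns contributed by the three VPTERNLOG sources.
  const uint8_t A = 0xF0, B = 0xCC, C = 0xAA;
  // The outer op applied to A and (B <inner op> C) gives the immediate.
  std::printf("AND(AND): 0x%02X\n", A & (B & C)); // 0x80
  std::printf("AND(OR):  0x%02X\n", A & (B | C)); // 0xE0
  std::printf("AND(XOR): 0x%02X\n", A & (B ^ C)); // 0x60
  std::printf("OR(AND):  0x%02X\n", A | (B & C)); // 0xF8
  std::printf("OR(OR):   0x%02X\n", A | (B | C)); // 0xFE
  std::printf("OR(XOR):  0x%02X\n", A | (B ^ C)); // 0xF6
  std::printf("XOR(AND): 0x%02X\n", A ^ (B & C)); // 0x78
  std::printf("XOR(OR):  0x%02X\n", A ^ (B | C)); // 0x1E
  std::printf("XOR(XOR): 0x%02X\n", A ^ (B ^ C)); // 0x96
  return 0;
}

Running this prints the nine values used in the Opc1/Opc2 switch that follows.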
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector())) + return false; + + unsigned Opc1 = N->getOpcode(); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + auto isLogicOp = [](unsigned Opc) { + return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR; + }; + + SDValue A, B, C; + unsigned Opc2; + if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) { + Opc2 = N1.getOpcode(); + A = N0; + B = N1.getOperand(0); + C = N1.getOperand(1); + } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) { + Opc2 = N0.getOpcode(); + A = N1; + B = N0.getOperand(0); + C = N0.getOperand(1); + } else + return false; + + uint64_t Imm; + switch (Opc1) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0x80; break; + case ISD::OR: Imm = 0xe0; break; + case ISD::XOR: Imm = 0x60; break; + } + break; + case ISD::OR: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0xf8; break; + case ISD::OR: Imm = 0xfe; break; + case ISD::XOR: Imm = 0xf6; break; + } + break; + case ISD::XOR: + switch (Opc2) { + default: llvm_unreachable("Unexpected opcode!"); + case ISD::AND: Imm = 0x78; break; + case ISD::OR: Imm = 0x1e; break; + case ISD::XOR: Imm = 0x96; break; + } + break; + } + + SDLoc DL(N); + SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C, + CurDAG->getTargetConstant(Imm, DL, MVT::i8)); + ReplaceNode(N, New.getNode()); + SelectCode(New.getNode()); + return true; +} + /// Convert vector increment or decrement to sub/add with an all-ones constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> @@ -4482,9 +4559,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case ISD::XOR: if (tryShrinkShlLogicImm(Node)) return; - if (Opcode == ISD::OR && tryMatchBitSelect(Node)) return; + if (tryVPTERNLOG(Node)) + return; LLVM_FALLTHROUGH; case ISD::ADD: diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index 6f72473..10f3d53 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -331,8 +331,8 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { define <8 x double> @ulto8f64(<8 x i64> %a) { ; NODQ-LABEL: ulto8f64: ; NODQ: # %bb.0: -; NODQ-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 -; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm1, %zmm1 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm0, %zmm1 ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vporq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; NODQ-NEXT: vsubpd {{.*}}(%rip){1to8}, %zmm0, %zmm0 @@ -356,21 +356,20 @@ define <16 x double> @ulto16f64(<16 x i64> %a) { ; NODQ-LABEL: ulto16f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295,4294967295] -; NODQ-NEXT: vpandq %zmm2, %zmm0, %zmm3 -; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm4 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] -; NODQ-NEXT: vporq %zmm4, %zmm3, %zmm3 +; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm3 = 
[4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; NODQ-NEXT: vmovdqa64 %zmm3, %zmm4 +; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm0, %zmm4 ; NODQ-NEXT: vpsrlq $32, %zmm0, %zmm0 ; NODQ-NEXT: vpbroadcastq {{.*#+}} zmm5 = [4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072,4985484787499139072] ; NODQ-NEXT: vporq %zmm5, %zmm0, %zmm0 ; NODQ-NEXT: vbroadcastsd {{.*#+}} zmm6 = [1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25,1.9342813118337666E+25] ; NODQ-NEXT: vsubpd %zmm6, %zmm0, %zmm0 -; NODQ-NEXT: vaddpd %zmm0, %zmm3, %zmm0 -; NODQ-NEXT: vpandq %zmm2, %zmm1, %zmm2 -; NODQ-NEXT: vporq %zmm4, %zmm2, %zmm2 +; NODQ-NEXT: vaddpd %zmm0, %zmm4, %zmm0 +; NODQ-NEXT: vpternlogq $248, %zmm2, %zmm1, %zmm3 ; NODQ-NEXT: vpsrlq $32, %zmm1, %zmm1 ; NODQ-NEXT: vporq %zmm5, %zmm1, %zmm1 ; NODQ-NEXT: vsubpd %zmm6, %zmm1, %zmm1 -; NODQ-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; NODQ-NEXT: vaddpd %zmm1, %zmm3, %zmm1 ; NODQ-NEXT: retq ; ; VLDQ-LABEL: ulto16f64: diff --git a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll index fbb2876..7e5e3e8 100644 --- a/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-gfni-intrinsics.ll @@ -7,21 +7,21 @@ define <16 x i8> @test_vgf2p8affineinvqb_128(<16 x i8> %src1, <16 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x03] -; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xd9,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xcf,0xd9,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xcf,0xd1,0x03] -; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; 
X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8affineinvqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) @@ -37,21 +37,21 @@ define <32 x i8> @test_vgf2p8affineinvqb_256(<32 x i8> %src1, <32 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x03] -; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xd9,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xcf,0xd9,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xcf,0xd1,0x03] -; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8affineinvqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) @@ -67,21 +67,21 @@ define <64 x i8> @test_vgf2p8affineinvqb_512(<64 x i8> %src1, <64 x i8> %src2, < ; X86-LABEL: test_vgf2p8affineinvqb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] -; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03] +; X86-NEXT: vgf2p8affineinvqb 
$3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x03] ; X86-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x03] -; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineinvqb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xd9,0x03] -; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xcf,0xe1,0x03] +; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xcf,0xd9,0x03] ; X64-NEXT: vgf2p8affineinvqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xcf,0xd1,0x03] -; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8affineinvqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) @@ -97,21 +97,21 @@ define <16 x i8> @test_vgf2p8affineqb_128(<16 x i8> %src1, <16 x i8> %src2, <16 ; X86-LABEL: test_vgf2p8affineqb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] -; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xd9,0x03] ; X86-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x03] -; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xd9,0x03] -; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: 
[0x62,0xf3,0xfd,0x89,0xce,0xd9,0x03] ; X64-NEXT: vgf2p8affineqb $3, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0xce,0xd1,0x03] -; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8affineqb.128(<16 x i8> %src1, <16 x i8> %src2, i8 3) @@ -127,21 +127,21 @@ define <32 x i8> @test_vgf2p8affineqb_256(<32 x i8> %src1, <32 x i8> %src2, <32 ; X86-LABEL: test_vgf2p8affineqb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] -; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x03] ; X86-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x03] -; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xd9,0x03] -; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0xce,0xd9,0x03] ; X64-NEXT: vgf2p8affineqb $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0xce,0xd1,0x03] -; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8affineqb.256(<32 x i8> %src1, <32 x i8> %src2, i8 3) @@ -157,21 +157,21 @@ define <64 x i8> @test_vgf2p8affineqb_512(<64 x i8> %src1, <64 x i8> %src2, <64 ; X86-LABEL: test_vgf2p8affineqb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] -; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 
{%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03] +; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x03] ; X86-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x03] -; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8affineqb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xd9,0x03] -; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf3,0xfd,0x48,0xce,0xe1,0x03] +; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0xce,0xd9,0x03] ; X64-NEXT: vgf2p8affineqb $3, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0xce,0xd1,0x03] -; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8affineqb.512(<64 x i8> %src1, <64 x i8> %src2, i8 3) @@ -187,21 +187,21 @@ define <16 x i8> @test_vgf2p8mulb_128(<16 x i8> %src1, <16 x i8> %src2, <16 x i8 ; X86-LABEL: test_vgf2p8mulb_128: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xd9] -; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xd9] ; X86-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xcf,0xd1] -; X86-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; X86-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xd9] -; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm3 
{%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0xcf,0xd9] ; X64-NEXT: vgf2p8mulb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xcf,0xd1] -; X64-NEXT: vpxor %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xc3] -; X64-NEXT: vpxor %xmm0, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %xmm2, %xmm4, %xmm3 # encoding: [0x62,0xf3,0xdd,0x08,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i16 %mask to <16 x i1> %2 = call <16 x i8> @llvm.x86.vgf2p8mulb.128(<16 x i8> %src1, <16 x i8> %src2) @@ -217,21 +217,21 @@ define <32 x i8> @test_vgf2p8mulb_256(<32 x i8> %src1, <32 x i8> %src2, <32 x i8 ; X86-LABEL: test_vgf2p8mulb_256: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xd9] -; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xd9] ; X86-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X86-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X86-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X86-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xd9] -; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0xcf,0xd9] ; X64-NEXT: vgf2p8mulb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0xcf,0xd1] -; X64-NEXT: vpxor %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xc3] -; X64-NEXT: vpxor %ymm0, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %ymm2, %ymm4, %ymm3 # encoding: [0x62,0xf3,0xdd,0x28,0x25,0xda,0x96] +; X64-NEXT: vmovdqa %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i32 %mask to <32 x i1> %2 = call <32 x i8> @llvm.x86.vgf2p8mulb.256(<32 x i8> %src1, <32 x i8> %src2) @@ -247,21 +247,21 @@ define <64 x i8> @test_vgf2p8mulb_512(<64 x i8> %src1, <64 x i8> %src2, <64 x i8 ; X86-LABEL: test_vgf2p8mulb_512: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xd9] -; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xe1] +; X86-NEXT: vgf2p8mulb %zmm1, 
%zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xd9] ; X86-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X86-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X86-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X86-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X86-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_vgf2p8mulb_512: ; X64: # %bb.0: ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xd9] -; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm4 # encoding: [0x62,0xf2,0x7d,0x48,0xcf,0xe1] +; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcf,0xd9] ; X64-NEXT: vgf2p8mulb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0xcf,0xd1] -; X64-NEXT: vpxorq %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xef,0xc3] -; X64-NEXT: vpxorq %zmm0, %zmm4, %zmm0 # encoding: [0x62,0xf1,0xdd,0x48,0xef,0xc0] +; X64-NEXT: vpternlogq $150, %zmm2, %zmm4, %zmm3 # encoding: [0x62,0xf3,0xdd,0x48,0x25,0xda,0x96] +; X64-NEXT: vmovdqa64 %zmm3, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc3] ; X64-NEXT: retq # encoding: [0xc3] %1 = bitcast i64 %mask to <64 x i1> %2 = call <64 x i8> @llvm.x86.vgf2p8mulb.512(<64 x i8> %src1, <64 x i8> %src2) diff --git a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll index e4420be..4e07b4a 100644 --- a/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll +++ b/llvm/test/CodeGen/X86/machine-combiner-int-vec.ll @@ -13,12 +13,18 @@ define <4 x i32> @reassociate_and_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_and_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_and_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm1 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_and_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogd $128, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = and <4 x i32> %x2, %t0 @@ -34,12 +40,18 @@ define <4 x i32> @reassociate_or_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> % ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_or_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpor %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_or_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm1 +; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_or_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogd $254, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = or <4 x i32> %x2, %t0 @@ -55,12 +67,18 @@ define <4 x i32> @reassociate_xor_v4i32(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> ; SSE-NEXT: pxor %xmm2, %xmm0 ; 
SSE-NEXT: retq ; -; AVX-LABEL: reassociate_xor_v4i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm3, %xmm2, %xmm1 -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_xor_v4i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_xor_v4i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogd $150, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: retq %t0 = add <4 x i32> %x0, %x1 %t1 = xor <4 x i32> %x2, %t0 @@ -81,12 +99,18 @@ define <8 x i32> @reassociate_and_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; SSE-NEXT: pand %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_and_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpand %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_and_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_and_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogd $128, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = and <8 x i32> %x2, %t0 @@ -105,12 +129,18 @@ define <8 x i32> @reassociate_or_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> % ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_or_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpor %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_or_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_or_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogd $254, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = or <8 x i32> %x2, %t0 @@ -129,12 +159,18 @@ define <8 x i32> @reassociate_xor_v8i32(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> ; SSE-NEXT: pxor %xmm5, %xmm1 ; SSE-NEXT: retq ; -; AVX-LABEL: reassociate_xor_v8i32: -; AVX: # %bb.0: -; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 -; AVX-NEXT: vpxor %ymm3, %ymm2, %ymm1 -; AVX-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX-NEXT: retq +; AVX2-LABEL: reassociate_xor_v8i32: +; AVX2: # %bb.0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: reassociate_xor_v8i32: +; AVX512: # %bb.0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpternlogd $150, %ymm2, %ymm3, %ymm0 +; AVX512-NEXT: retq %t0 = add <8 x i32> %x0, %x1 %t1 = xor <8 x i32> %x2, %t0 @@ -175,8 +211,7 @@ define <16 x i32> @reassociate_and_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: reassociate_and_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpandd %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd $128, %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -215,8 +250,7 @@ define <16 x i32> @reassociate_or_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x i ; AVX512-LABEL: reassociate_or_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; 
AVX512-NEXT: vpord %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd $254, %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 @@ -255,8 +289,7 @@ define <16 x i32> @reassociate_xor_v16i32(<16 x i32> %x0, <16 x i32> %x1, <16 x ; AVX512-LABEL: reassociate_xor_v16i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0 -; AVX512-NEXT: vpxord %zmm3, %zmm2, %zmm1 -; AVX512-NEXT: vpxord %zmm1, %zmm0, %zmm0 +; AVX512-NEXT: vpternlogd $150, %zmm2, %zmm3, %zmm0 ; AVX512-NEXT: retq %t0 = add <16 x i32> %x0, %x1 diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index 7f0e19e..f78ab3b 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -2263,12 +2263,12 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %xmm3, %xmm1, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %xmm4, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -3123,14 +3123,14 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm3, %xmm3, %xmm3 -; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %xmm3, %xmm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %xmm3, %xmm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero,xmm3[8],zero,xmm3[9],zero,xmm3[10],zero,xmm3[11],zero,xmm3[12],zero,xmm3[13],zero,xmm3[14],zero,xmm3[15],zero +; AVX512VL-FALLBACK-NEXT: vpmovzxbw {{.*#+}} ymm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero,xmm4[8],zero,xmm4[9],zero,xmm4[10],zero,xmm4[11],zero,xmm4[12],zero,xmm4[13],zero,xmm4[14],zero,xmm4[15],zero ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero ; AVX512VL-FALLBACK-NEXT: vpmovdb %zmm1, %xmm1 diff --git 
a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 0d28d61..15723c6 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1855,12 +1855,12 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq ; @@ -2789,21 +2789,21 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512VL-FALLBACK: # %bb.0: ; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm3, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpor {{.*}}(%rip), %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm4, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpternlogq $190, {{.*}}(%rip), %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm4, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index c3743ca..5414cdd 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -418,21 +418,20 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpminuw %ymm2, %ymm3, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm4, %ymm3, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm7, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpminuw %ymm1, %ymm0, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm8, %ymm0, %ymm9 +; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm9 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubw %ymm8, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm4, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm9, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 @@ -877,45 +876,44 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; ; AVX512VL-FALLBACK-LABEL: vec512_i8_unsigned_reg_reg: ; AVX512VL-FALLBACK: # %bb.0: -; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm3 +; 
AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm4 ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm3, %ymm2, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm5, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm5, %ymm5 -; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm7 -; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm7, %ymm0, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm8, %ymm8, %ymm8 -; AVX512VL-FALLBACK-NEXT: vpor %ymm6, %ymm8, %ymm6 -; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm3, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm4, %ymm2, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm5, %ymm2, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpminub %ymm1, %ymm0, %ymm8 +; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm8, %ymm0, %ymm9 +; AVX512VL-FALLBACK-NEXT: vpternlogq $222, %ymm6, %ymm7, %ymm9 +; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm4, %ymm2, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpsubb %ymm4, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm4 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] -; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm8, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpsubb %ymm5, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm5 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm4, %ymm4 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm6[8],ymm0[8],ymm6[9],ymm0[9],ymm6[10],ymm0[10],ymm6[11],ymm0[11],ymm6[12],ymm0[12],ymm6[13],ymm0[13],ymm6[14],ymm0[14],ymm6[15],ymm0[15],ymm6[24],ymm0[24],ymm6[25],ymm0[25],ymm6[26],ymm0[26],ymm6[27],ymm0[27],ymm6[28],ymm0[28],ymm6[29],ymm0[29],ymm6[30],ymm0[30],ymm6[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpand %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] +; 
AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm9[8],ymm0[8],ymm9[9],ymm0[9],ymm9[10],ymm0[10],ymm9[11],ymm0[11],ymm9[12],ymm0[12],ymm9[13],ymm0[13],ymm9[14],ymm0[14],ymm9[15],ymm0[15],ymm9[24],ymm0[24],ymm9[25],ymm0[25],ymm9[26],ymm0[26],ymm9[27],ymm0[27],ymm9[28],ymm0[28],ymm9[29],ymm0[29],ymm9[30],ymm0[30],ymm9[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm5, %ymm5 ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[2],ymm0[2],ymm6[3],ymm0[3],ymm6[4],ymm0[4],ymm6[5],ymm0[5],ymm6[6],ymm0[6],ymm6[7],ymm0[7],ymm6[16],ymm0[16],ymm6[17],ymm0[17],ymm6[18],ymm0[18],ymm6[19],ymm0[19],ymm6[20],ymm0[20],ymm6[21],ymm0[21],ymm6[22],ymm0[22],ymm6[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm5[8],ymm0[8],ymm5[9],ymm0[9],ymm5[10],ymm0[10],ymm5[11],ymm0[11],ymm5[12],ymm0[12],ymm5[13],ymm0[13],ymm5[14],ymm0[14],ymm5[15],ymm0[15],ymm5[24],ymm0[24],ymm5[25],ymm0[25],ymm5[26],ymm0[26],ymm5[27],ymm0[27],ymm5[28],ymm0[28],ymm5[29],ymm0[29],ymm5[30],ymm0[30],ymm5[31],ymm0[31] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm6, %ymm4, %ymm4 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm7 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[2],ymm0[2],ymm9[3],ymm0[3],ymm9[4],ymm0[4],ymm9[5],ymm0[5],ymm9[6],ymm0[6],ymm9[7],ymm0[7],ymm9[16],ymm0[16],ymm9[17],ymm0[17],ymm9[18],ymm0[18],ymm9[19],ymm0[19],ymm9[20],ymm0[20],ymm9[21],ymm0[21],ymm9[22],ymm0[22],ymm9[23],ymm0[23] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm1, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm4[8],ymm0[8],ymm4[9],ymm0[9],ymm4[10],ymm0[10],ymm4[11],ymm0[11],ymm4[12],ymm0[12],ymm4[13],ymm0[13],ymm4[14],ymm0[14],ymm4[15],ymm0[15],ymm4[24],ymm0[24],ymm4[25],ymm0[25],ymm4[26],ymm0[26],ymm4[27],ymm0[27],ymm4[28],ymm0[28],ymm4[29],ymm0[29],ymm4[30],ymm0[30],ymm4[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm7, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm5, %ymm5 +; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm0[0],ymm4[1],ymm0[1],ymm4[2],ymm0[2],ymm4[3],ymm0[3],ymm4[4],ymm0[4],ymm4[5],ymm0[5],ymm4[6],ymm0[6],ymm4[7],ymm0[7],ymm4[16],ymm0[16],ymm4[17],ymm0[17],ymm4[18],ymm0[18],ymm4[19],ymm0[19],ymm4[20],ymm0[20],ymm4[21],ymm0[21],ymm4[22],ymm0[22],ymm4[23],ymm0[23] ; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm0[0],ymm5[1],ymm0[1],ymm5[2],ymm0[2],ymm5[3],ymm0[3],ymm5[4],ymm0[4],ymm5[5],ymm0[5],ymm5[6],ymm0[6],ymm5[7],ymm0[7],ymm5[16],ymm0[16],ymm5[17],ymm0[17],ymm5[18],ymm0[18],ymm5[19],ymm0[19],ymm5[20],ymm0[20],ymm5[21],ymm0[21],ymm5[22],ymm0[22],ymm5[23],ymm0[23] -; AVX512VL-FALLBACK-NEXT: vpmullw %ymm5, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpand %ymm7, %ymm3, %ymm3 -; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm4, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm3, %ymm3 +; AVX512VL-FALLBACK-NEXT: vpackuswb %ymm5, %ymm3, %ymm3 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 4d18515..4ec3d21 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -495,20 +495,49 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; 
+; AVX512-LABEL: v16i4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %z = call <16 x i4> @llvm.sadd.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index dd8d546..605b2f1 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -495,20 +495,49 @@ define <16 x i4> @v16i4(<16 x i4> %x, <16 x i4> %y) nounwind { ; SSE-NEXT: psubb %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: v16i4: -; AVX: # %bb.0: -; AVX-NEXT: vpsllw $4, %xmm1, %xmm1 -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] -; AVX-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX-NEXT: vpsllw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX-NEXT: vpxor %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpsubb %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: v16i4: +; AVX1: # %bb.0: +; AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: v16i4: +; AVX2: # %bb.0: +; AVX2-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX2-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: v16i4: +; AVX512: # %bb.0: +; AVX512-NEXT: vpsllw $4, %xmm1, %xmm1 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512-NEXT: vpsllw $4, %xmm0, %xmm0 +; AVX512-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vpsubb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: retq %z = call <16 x i4> @llvm.ssub.sat.v16i4(<16 x i4> %x, <16 x i4> %y) ret <16 x i4> %z } diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll 
b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index 6b86345..1570fdc 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -552,8 +552,8 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_2i64_to_2f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 @@ -905,8 +905,8 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) { ; ; AVX512VL-LABEL: uitofp_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip){1to4}, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 @@ -3464,8 +3464,8 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_2i64_to_2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1 -; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm1, %xmm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4841369599423283200,4841369599423283200] +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm0, %xmm1 ; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 ; AVX512VL-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip), %xmm0, %xmm0 @@ -3847,8 +3847,8 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { ; AVX512VL-LABEL: uitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512VL-NEXT: vpandq {{.*}}(%rip){1to4}, %ymm0, %ymm1 -; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm1, %ymm1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4841369599423283200,4841369599423283200,4841369599423283200,4841369599423283200] +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip){1to4}, %ymm0, %ymm1 ; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX512VL-NEXT: vporq {{.*}}(%rip){1to4}, %ymm0, %ymm0 ; AVX512VL-NEXT: vsubpd {{.*}}(%rip){1to4}, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-bitreverse.ll b/llvm/test/CodeGen/X86/vector-bitreverse.ll index d28af7ad..ddb90c7 100644 --- a/llvm/test/CodeGen/X86/vector-bitreverse.ll +++ b/llvm/test/CodeGen/X86/vector-bitreverse.ll @@ -1911,15 +1911,13 @@ define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind { ; ; AVX512F-LABEL: test_bitreverse_v16i32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 -; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm2 -; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm2, %zmm2 -; AVX512F-NEXT: vpord %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vpslld $8, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $24, %zmm0, %zmm2 -; AVX512F-NEXT: vpslld $8, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogd $248, {{.*}}(%rip){1to16}, %zmm1, %zmm2 +; AVX512F-NEXT: vpsrld $24, %zmm0, %zmm1 +; AVX512F-NEXT: vpsrld $8, %zmm0, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpord %zmm0, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd $254, 
%zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm1 ; AVX512F-NEXT: vpslld $4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm0, %zmm0 @@ -2219,27 +2217,22 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; ; AVX512F-LABEL: test_bitreverse_v8i64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm1 -; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm1 +; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm2 +; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm1 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 -; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2 -; AVX512F-NEXT: vpsllq $56, %zmm0, %zmm3 -; AVX512F-NEXT: vpsllq $40, %zmm0, %zmm0 +; AVX512F-NEXT: vpternlogq $254, %zmm1, %zmm2, %zmm3 +; AVX512F-NEXT: vpsrlq $40, %zmm0, %zmm1 +; AVX512F-NEXT: vpsrlq $56, %zmm0, %zmm2 +; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm2 +; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm0 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512F-NEXT: vpternlogq $248, {{.*}}(%rip){1to8}, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq $254, %zmm2, %zmm3, %zmm0 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm1 ; AVX512F-NEXT: vpsllq $4, %zmm1, %zmm1 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index cf8a80c..ff77e4e 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1550,11 +1550,10 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 ; AVX512VL-NEXT: vpsrlw $8, %xmm4, %xmm4 ; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand %ymm4, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm3, %ymm1 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm1, %ymm3, %ymm4 +; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm0, %ymm4, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index b6c5d9f..5c3f9da 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -867,16 +867,14 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm8, %xmm6 ; AVX512VL-NEXT: vpsrlw $8, %xmm6, %xmm6 ; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512VL-NEXT: vpand %ymm6, %ymm9, 
%ymm8 -; AVX512VL-NEXT: vpor %ymm8, %ymm10, %ymm8 -; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpternlogq $236, %ymm6, %ymm10, %ymm9 +; AVX512VL-NEXT: vpxor %xmm8, %xmm8, %xmm8 +; AVX512VL-NEXT: vpcmpeqb %ymm8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm9, %ymm4 ; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm5 ; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 ; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm6, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm1, %ymm5, %ymm1 +; AVX512VL-NEXT: vpternlogq $236, %ymm6, %ymm5, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq @@ -894,12 +892,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512BW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512BW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 +; AVX512BW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 +; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512BW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -918,12 +915,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 +; AVX512VBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -942,12 +938,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VLBW-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLBW-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 +; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 +; AVX512VLBW-NEXT: 
vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLBW-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -966,12 +961,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 ; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm4 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm1, %zmm1 -; AVX512VLVBMI2-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %zmm1, %zmm6 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm4, %xmm5, %xmm1 +; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm1, %zmm1 +; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm6, %zmm3, %zmm1 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} ; AVX512VLVBMI2-NEXT: vmovdqa64 %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index ca624b0..41c538d 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -446,12 +446,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i8: @@ -837,12 +836,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll index 8cb0f36..6b7fc3d 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-512.ll @@ -186,12 +186,11 @@ define <64 x i8> 
@var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm9 -; AVX512VL-NEXT: vpor %ymm5, %ymm9, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm7 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm9 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpternlogq $248, %ymm9, %ymm5, %ymm7 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm7, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 @@ -204,11 +203,10 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm7, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, %ymm9, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -435,13 +433,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -460,12 +456,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: @@ -483,12 +478,11 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw 
%xmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm0, %zmm3, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm2, %zmm3, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshl.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 8898373..b62675b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1538,24 +1538,23 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> % ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpsllw %xmm3, %xmm4, %xmm3 +; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm4 -; AVX512VL-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VL-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 +; AVX512VL-NEXT: vpsrlw %xmm3, %ymm1, %ymm5 +; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 ; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 ; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 -; AVX512VL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsllw %xmm4, %xmm5, %xmm4 -; AVX512VL-NEXT: vpbroadcastb %xmm4, %ymm4 -; AVX512VL-NEXT: vpand %ymm4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX512VL-NEXT: vpcmpeqb %ymm3, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm0, %ymm3 +; AVX512VL-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX512VL-NEXT: vpcmpeqb %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index ca559a6..21920e3d 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -838,34 +838,32 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; ; AVX512VL-LABEL: splatvar_funnnel_v64i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm9 -; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm4 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm10 +; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm4 
; AVX512VL-NEXT: vpbroadcastb %xmm2, %ymm2 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm2, %ymm2 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm4, %ymm6 +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm5 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VL-NEXT: vpsubb %xmm2, %xmm5, %xmm5 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,zero,zero,zero,zero,xmm5[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsllw %xmm5, %ymm4, %ymm4 ; AVX512VL-NEXT: vpcmpeqd %xmm8, %xmm8, %xmm8 -; AVX512VL-NEXT: vpsrlw %xmm5, %xmm8, %xmm7 -; AVX512VL-NEXT: vpsrlw $8, %xmm7, %xmm7 +; AVX512VL-NEXT: vpsllw %xmm5, %xmm8, %xmm7 ; AVX512VL-NEXT: vpbroadcastb %xmm7, %ymm7 -; AVX512VL-NEXT: vpand %ymm7, %ymm6, %ymm10 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VL-NEXT: vpsubb %xmm2, %xmm3, %xmm3 -; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsllw %xmm3, %ymm9, %ymm9 -; AVX512VL-NEXT: vpsllw %xmm3, %xmm8, %xmm6 -; AVX512VL-NEXT: vpbroadcastb %xmm6, %ymm6 -; AVX512VL-NEXT: vpand %ymm6, %ymm9, %ymm8 -; AVX512VL-NEXT: vpor %ymm10, %ymm8, %ymm8 -; AVX512VL-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX512VL-NEXT: vpcmpeqb %ymm9, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm8, %ymm4 -; AVX512VL-NEXT: vpsrlw %xmm5, %ymm1, %ymm5 -; AVX512VL-NEXT: vpand %ymm7, %ymm5, %ymm5 -; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm6, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm7, %ymm4, %ymm4 +; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm6 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VL-NEXT: vpsrlw %xmm6, %ymm10, %ymm9 +; AVX512VL-NEXT: vpsrlw %xmm6, %xmm8, %xmm3 +; AVX512VL-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VL-NEXT: vpbroadcastb %xmm3, %ymm3 +; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm4, %ymm9 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm2, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm10, %ymm9, %ymm4 +; AVX512VL-NEXT: vpsllw %xmm5, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm7, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm6, %ymm1, %ymm5 +; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm0, %ymm5 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm5, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -873,21 +871,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 -; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512BW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512BW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vpbroadcastb %xmm3, 
%zmm3 -; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512BW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 -; AVX512BW-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512BW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 +; AVX512BW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 ; AVX512BW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512BW-NEXT: retq @@ -896,21 +893,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 -; AVX512VBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512VBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 ; AVX512VBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 -; AVX512VBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512VBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 +; AVX512VBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512VBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 ; AVX512VBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VBMI2-NEXT: retq @@ -919,21 +915,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 -; 
AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VLBW-NEXT: vpsllw %xmm3, %xmm4, %xmm3 ; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLBW-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsllw %xmm4, %xmm5, %xmm4 -; AVX512VLBW-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VLBW-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 +; AVX512VLBW-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 ; AVX512VLBW-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLBW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLBW-NEXT: retq @@ -942,21 +937,20 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %y, <64 x i8> % ; AVX512VLVBMI2: # %bb.0: ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLVBMI2-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %zmm1, %zmm4 -; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VLVBMI2-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm3, %xmm3 +; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] +; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm3, %xmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 +; AVX512VLVBMI2-NEXT: vpsllw %xmm3, %xmm4, %xmm3 ; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} xmm4 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] -; AVX512VLVBMI2-NEXT: vpsubb %xmm2, %xmm4, %xmm4 -; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,zero,zero,zero,zero,xmm4[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vpsllw %xmm4, %xmm5, %xmm4 -; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm4, %zmm4 -; AVX512VLVBMI2-NEXT: vpandq %zmm4, %zmm0, %zmm0 -; AVX512VLVBMI2-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpandq %zmm3, %zmm0, %zmm3 +; AVX512VLVBMI2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm1, %zmm5 +; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %xmm4, %xmm0 +; 
AVX512VLVBMI2-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLVBMI2-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLVBMI2-NEXT: vpternlogq $236, %zmm5, %zmm3, %zmm0 ; AVX512VLVBMI2-NEXT: vptestnmb %zmm2, %zmm2, %k1 ; AVX512VLVBMI2-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} ; AVX512VLVBMI2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index bf7c057..e617cc0 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -487,12 +487,11 @@ define <32 x i8> @var_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_funnnel_v32i8: @@ -912,12 +911,11 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll index 3838dfd..9de5bdb 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-512.ll @@ -197,12 +197,11 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm5 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm8, %ymm5, %ymm5 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm10 -; AVX512VL-NEXT: vpor %ymm5, %ymm10, %ymm5 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm10 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpternlogq $248, %ymm10, %ymm5, %ymm8 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm8, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm5 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm5 @@ -216,11 +215,10 @@ define <64 x i8> @var_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind { ; 
AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm8, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, %ymm10, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -455,60 +453,56 @@ define <64 x i8> @splatvar_funnnel_v64i8(<64 x i8> %x, <64 x i8> %amt) nounwind ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_funnnel_v64i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 +; AVX512BW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512BW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpsllw %xmm2, %zmm0, %zmm4 ; AVX512BW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512BW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512BW-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512BW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512BW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512BW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512BW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 -; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsllw %xmm2, %xmm5, %xmm2 +; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 +; AVX512BW-NEXT: vpandq %zmm2, %zmm4, %zmm2 +; AVX512BW-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsllw %xmm1, %xmm5, %xmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_funnnel_v64i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %xmm1 -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] -; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 -; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm3 = 
xmm3[0],zero,zero,zero,zero,zero,zero,zero,xmm3[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsrlw %xmm3, %zmm0, %zmm4 +; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7] +; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,zero,zero,zero +; AVX512VLBW-NEXT: vpsllw %xmm2, %zmm0, %zmm4 ; AVX512VLBW-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5 -; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm5, %xmm3 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm3, %xmm3 -; AVX512VLBW-NEXT: vpbroadcastb %xmm3, %zmm3 -; AVX512VLBW-NEXT: vpandq %zmm3, %zmm4, %zmm3 -; AVX512VLBW-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX512VLBW-NEXT: vpsubb %xmm1, %xmm4, %xmm1 -; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm5, %xmm2 +; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 +; AVX512VLBW-NEXT: vpandq %zmm2, %zmm4, %zmm2 +; AVX512VLBW-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VLBW-NEXT: vpsllw %xmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsllw %xmm1, %xmm5, %xmm1 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm3, %zmm0, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm5, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %amt, <64 x i8> undef, <64 x i32> zeroinitializer %res = call <64 x i8> @llvm.fshr.v64i8(<64 x i8> %x, <64 x i8> %x, <64 x i8> %splat) diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll index 336311e..550cf38 100644 --- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll +++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-512.ll @@ -186,9 +186,8 @@ define <64 x i8> @test_div7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxorq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm2, %zmm1 ; AVX512BW-NEXT: vpsrlw $7, %zmm0, %zmm0 ; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 @@ -541,9 +540,8 @@ define <64 x i8> @test_rem7_64i8(<64 x i8> %a) nounwind { ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsrlw $2, %zmm1, %zmm2 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32] -; AVX512BW-NEXT: vpxorq %zmm3, %zmm2, %zmm2 +; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm3, %zmm2 ; AVX512BW-NEXT: vpsrlw $7, %zmm1, %zmm1 ; AVX512BW-NEXT: vpandq 
{{.*}}(%rip), %zmm1, %zmm1 ; AVX512BW-NEXT: vpaddb %zmm1, %zmm2, %zmm1 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 3acdca7..f2eb1ae 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -2003,13 +2003,35 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v8i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $11, %xmm0, %xmm1 -; AVX512-NEXT: vpsllw $5, %xmm0, %xmm0 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_rotate_mask_v8i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $11, %xmm0, %xmm1 +; AVX512F-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512F-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX512VL-NEXT: vpsrlw $11, %xmm0, %xmm0 +; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlw $11, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsllw $5, %xmm0, %xmm0 +; AVX512BW-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsllw $5, %xmm0, %xmm1 +; AVX512VLBW-NEXT: vpsrlw $11, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VLBW-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v8i16: ; XOP: # %bb.0: @@ -2055,14 +2077,39 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind { ; AVX-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v16i8: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsllw $4, %xmm0, %xmm1 -; AVX512-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_rotate_mask_v16i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512F-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v16i8: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v16i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512BW-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm1 +; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i8: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsllw $4, %xmm0, %xmm1 +; AVX512VLBW-NEXT: vpsrlw $4, %xmm0, %xmm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %xmm1, %xmm0 +; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %xmm2, %xmm0 +; 
AVX512VLBW-NEXT: retq ; ; XOP-LABEL: splatconstant_rotate_mask_v16i8: ; XOP: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index df76a77..158dc3b 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -442,12 +442,11 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm2 -; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm3, %ymm3 -; AVX512VL-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm2 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: var_rotate_v32i8: @@ -827,12 +826,11 @@ define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8] ; AVX512VL-NEXT: vpsubb %xmm1, %xmm3, %xmm1 ; AVX512VL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero -; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 -; AVX512VL-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VL-NEXT: vpbroadcastb %xmm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 +; AVX512VL-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 +; AVX512VL-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VL-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm3, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatvar_rotate_v32i8: @@ -1713,13 +1711,35 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind { ; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX2-NEXT: retq ; -; AVX512-LABEL: splatconstant_rotate_mask_v16i16: -; AVX512: # %bb.0: -; AVX512-NEXT: vpsrlw $11, %ymm0, %ymm1 -; AVX512-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512-NEXT: retq +; AVX512F-LABEL: splatconstant_rotate_mask_v16i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpsrlw $11, %ymm0, %ymm1 +; AVX512F-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm1 +; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpsrlw $11, %ymm0, %ymm1 +; AVX512BW-NEXT: vpsllw $5, %ymm0, %ymm0 +; AVX512BW-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpsllw $5, %ymm0, %ymm1 +; AVX512VLBW-NEXT: vpsrlw $11, %ymm0, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLBW-NEXT: retq ; ; XOPAVX1-LABEL: 
splatconstant_rotate_mask_v16i16: ; XOPAVX1: # %bb.0: @@ -1788,9 +1808,11 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-LABEL: splatconstant_rotate_mask_v32i8: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm1 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VL-NEXT: vpternlogq $216, {{.*}}(%rip), %ymm1, %ymm0 -; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VL-NEXT: vpandn %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: vpternlogq $168, {{.*}}(%rip), %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i8: @@ -1805,10 +1827,9 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind { ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i8: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $4, %ymm0, %ymm1 -; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 -; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm1 -; AVX512VLBW-NEXT: vpor %ymm0, %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpsrlw $4, %ymm0, %ymm2 +; AVX512VLBW-NEXT: vpand {{.*}}(%rip), %ymm1, %ymm0 +; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %ymm2, %ymm0 ; AVX512VLBW-NEXT: retq ; ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8: diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll index d92d73a..0b858e7 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-512.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll @@ -179,12 +179,11 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 ; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw $7, %ymm3, %ymm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm5 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX512VL-NEXT: vpand %ymm5, %ymm4, %ymm4 -; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm8 -; AVX512VL-NEXT: vpor %ymm4, %ymm8, %ymm4 +; AVX512VL-NEXT: vpaddb %ymm3, %ymm3, %ymm5 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm4, %ymm5 ; AVX512VL-NEXT: vpaddb %ymm2, %ymm2, %ymm2 -; AVX512VL-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; AVX512VL-NEXT: vpblendvb %ymm2, %ymm5, %ymm3, %ymm2 ; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm4 ; AVX512VL-NEXT: vpternlogq $226, %ymm3, %ymm6, %ymm4 @@ -196,11 +195,10 @@ define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 ; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $7, %ymm0, %ymm3 -; AVX512VL-NEXT: vpand %ymm5, %ymm3, %ymm3 ; AVX512VL-NEXT: vpaddb %ymm0, %ymm0, %ymm4 -; AVX512VL-NEXT: vpor %ymm3, %ymm4, %ymm3 +; AVX512VL-NEXT: vpternlogq $248, %ymm8, %ymm3, %ymm4 ; AVX512VL-NEXT: vpaddb %ymm1, %ymm1, %ymm1 -; AVX512VL-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -425,13 +423,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VL-NEXT: vpsrlw %xmm1, %xmm5, %xmm5 ; AVX512VL-NEXT: vpsrlw $8, %xmm5, %xmm5 ; AVX512VL-NEXT: vpbroadcastb %xmm5, %ymm5 -; 
AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 -; AVX512VL-NEXT: vpor %ymm2, %ymm4, %ymm2 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm4, %ymm2 ; AVX512VL-NEXT: vpsllw %xmm3, %ymm0, %ymm3 ; AVX512VL-NEXT: vpand %ymm6, %ymm3, %ymm3 ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpternlogq $236, %ymm5, %ymm3, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; @@ -446,12 +442,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512BW-NEXT: vpsllw %xmm2, %xmm4, %xmm2 ; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512BW-NEXT: vpandq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 -; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 +; AVX512BW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 +; AVX512BW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512BW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512BW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatvar_rotate_v64i8: @@ -465,12 +460,11 @@ define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind { ; AVX512VLBW-NEXT: vpsllw %xmm2, %xmm4, %xmm2 ; AVX512VLBW-NEXT: vpbroadcastb %xmm2, %zmm2 ; AVX512VLBW-NEXT: vpandq %zmm2, %zmm3, %zmm2 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm1 -; AVX512VLBW-NEXT: vpsrlw $8, %xmm1, %xmm1 -; AVX512VLBW-NEXT: vpbroadcastb %xmm1, %zmm1 -; AVX512VLBW-NEXT: vpandq %zmm1, %zmm0, %zmm0 -; AVX512VLBW-NEXT: vporq %zmm0, %zmm2, %zmm0 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3 +; AVX512VLBW-NEXT: vpsrlw %xmm1, %xmm4, %xmm0 +; AVX512VLBW-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBW-NEXT: vpbroadcastb %xmm0, %zmm0 +; AVX512VLBW-NEXT: vpternlogq $236, %zmm3, %zmm2, %zmm0 ; AVX512VLBW-NEXT: retq %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer %splat8 = sub <64 x i8> , %splat @@ -889,31 +883,27 @@ define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind { ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [55,55,55,55,55,55,55,55,55,55,55,55,55,55,55,55] ; AVX512VL-NEXT: vpsrlw $11, %ymm1, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm1, %ymm1 -; AVX512VL-NEXT: vpor %ymm3, %ymm1, %ymm1 -; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpternlogq $200, %ymm3, %ymm2, %ymm1 ; AVX512VL-NEXT: vpsrlw $11, %ymm0, %ymm3 ; AVX512VL-NEXT: vpsllw $5, %ymm0, %ymm0 -; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0 -; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpternlogq $200, %ymm3, %ymm2, %ymm0 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpsllw $5, %zmm0, %zmm1 -; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 -; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1 -; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpsrlw $11, %zmm0, %zmm2 +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpsllw $5, %zmm0, %zmm1 -; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm0 -; 
AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $11, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
   %shl = shl <32 x i16> %a,
   %lshr = lshr <32 x i16> %a,
@@ -948,34 +938,34 @@ define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm1
 ; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm2
-; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
-; AVX512VL-NEXT: vpternlogq $226, %ymm2, %ymm3, %ymm1
-; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
-; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4
+; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpsllw $4, %ymm1, %ymm1
+; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1
+; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm4 = [39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39]
+; AVX512VL-NEXT: vpternlogq $200, %ymm2, %ymm4, %ymm1
+; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm2
+; AVX512VL-NEXT: vpandn %ymm2, %ymm3, %ymm2
 ; AVX512VL-NEXT: vpsllw $4, %ymm0, %ymm0
-; AVX512VL-NEXT: vpternlogq $226, %ymm4, %ymm3, %ymm0
-; AVX512VL-NEXT: vpand %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $200, %ymm2, %ymm4, %ymm0
 ; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512VL-NEXT: retq
 ;
 ; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512BW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512BW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
 ; AVX512VLBW: # %bb.0:
 ; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm1
-; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
-; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm1
-; AVX512VLBW-NEXT: vporq %zmm0, %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm2
+; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm1, %zmm0
+; AVX512VLBW-NEXT: vpternlogq $248, {{.*}}(%rip), %zmm2, %zmm0
 ; AVX512VLBW-NEXT: retq
   %shl = shl <64 x i8> %a,
   %lshr = lshr <64 x i8> %a,
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
index 8b30982..38c3488 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -1443,9 +1443,8 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v16i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
index 2f059a8..0d92b72 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -946,15 +946,14 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX512DQVL: # %bb.0:
 ; AVX512DQVL-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512DQVL-NEXT: vpbroadcastb %xmm2, %ymm2
-; AVX512DQVL-NEXT: vpand %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: vmovdqa {{.*#+}} ymm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm1
-; AVX512DQVL-NEXT: vpxor %ymm1, %ymm0, %ymm0
-; AVX512DQVL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
+; AVX512DQVL-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512DQVL-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
+; AVX512DQVL-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512DQVL-NEXT: vpbroadcastb %xmm1, %ymm1
+; AVX512DQVL-NEXT: vpternlogq $108, %ymm1, %ymm2, %ymm0
+; AVX512DQVL-NEXT: vpsubb %ymm2, %ymm0, %ymm0
 ; AVX512DQVL-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: splatvar_shift_v32i8:
@@ -1632,9 +1631,8 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v32i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %ymm1, %ymm0
 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0
 ; AVX512VL-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
index 1bb6297..b4e8e85 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-512.ll
@@ -212,15 +212,14 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw %xmm1, %xmm2, %xmm2
-; AVX512BW-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX512BW-NEXT: vpbroadcastb %xmm2, %zmm2
-; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896,32896]
-; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpsrlw %xmm1, %zmm2, %zmm2
+; AVX512BW-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX512BW-NEXT: vpsrlw %xmm1, %xmm3, %xmm1
+; AVX512BW-NEXT: vpsrlw $8, %xmm1, %xmm1
+; AVX512BW-NEXT: vpbroadcastb %xmm1, %zmm1
+; AVX512BW-NEXT: vpternlogq $108, %zmm1, %zmm2, %zmm0
+; AVX512BW-NEXT: vpsubb %zmm2, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
   %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
   %shift = ashr <64 x i8> %a, %splat
@@ -375,9 +374,8 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
 ; AVX512BW-LABEL: splatconstant_shift_v64i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0
-; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512BW-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpternlogq $108, {{.*}}(%rip), %zmm1, %zmm0
 ; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT: retq
   %shift = ashr <64 x i8> %a,
diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
index 3f0345a..d410d49 100644
--- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
+++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll
@@ -2354,9 +2354,8 @@ define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v8i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -2408,9 +2407,8 @@ define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v4i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
@@ -2462,9 +2460,8 @@ define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
 ; AVX512VL-LABEL: splatconstant_shift_v2i8:
 ; AVX512VL: # %bb.0:
 ; AVX512VL-NEXT: vpsrlw $3, %xmm0, %xmm0
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
-; AVX512VL-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpternlogq $108, {{.*}}(%rip), %xmm1, %xmm0
 ; AVX512VL-NEXT: vpsubb %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT: retq
 ;
-- 
2.7.4
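
A note on the vpternlogq immediates in the updated CHECK lines above ($108, $200, $248): the imm8 operand is an 8-entry truth table over the instruction's three sources, where bit (a<<2 | b<<1 | c) holds the result for source-1 bit a, source-2 bit b and source-3 bit c. In the AT&T-syntax assembly shown in these tests, the last register operand is source 1 (and the destination), the middle operand is source 2, and the first operand (often a memory reference) is source 3. The short C++ sketch below is not part of the patch, and the helper name ternlogImm is invented purely for illustration; it only shows how the immediate values that appear in these tests can be derived.

// Standalone illustration (not part of this patch): derive a VPTERNLOG imm8
// from a ternary boolean function. Bit (a<<2 | b<<1 | c) of the immediate is
// the function's result for source-1 bit a, source-2 bit b, source-3 bit c.
#include <cassert>
#include <cstdint>

// Hypothetical helper, named for this sketch only.
template <typename Fn> uint8_t ternlogImm(Fn F) {
  uint8_t Imm = 0;
  for (unsigned I = 0; I < 8; ++I) {
    bool A = (I >> 2) & 1, B = (I >> 1) & 1, C = I & 1;
    if (F(A, B, C))
      Imm |= 1 << I;
  }
  return Imm;
}

int main() {
  // A | (B & C) -> 0xf8, matching the "vpternlogq $248" rotate-mask lines.
  assert(ternlogImm([](bool A, bool B, bool C) { return A | (B & C); }) == 0xf8);
  // (A & C) ^ B -> 0x6c, matching the "vpternlogq $108" arithmetic-shift lines.
  assert(ternlogImm([](bool A, bool B, bool C) { return (A & C) ^ B; }) == 0x6c);
  // (A | C) & B -> 0xc8, matching the "vpternlogq $200" v64i8 rotate-mask lines.
  assert(ternlogImm([](bool A, bool B, bool C) { return (A | C) & B; }) == 0xc8);
}

Building and running the sketch with any C++11 compiler exercises the three asserts, which correspond to the immediates 248 (0xf8), 108 (0x6c) and 200 (0xc8) used in the rotate-mask and arithmetic-shift tests above.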