From 32857c54d2232fc91226381db9ecfe4e95d57da0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 14 Sep 2018 18:33:31 +0000 Subject: [PATCH] [X86][SSE] Lower shuffles to permute(unpack(x,y)) (PR31151) Attempt to lower a shuffle as an unpack of elements from two inputs followed by a single-input (wider) permutation. As long as the permutation is wider this is a win - there may be some circumstances where same size permutations would also be useful but I've left that for future work. Differential Revision: https://reviews.llvm.org/D52043 llvm-svn: 342257 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 80 ++++++++++++- llvm/test/CodeGen/X86/debuginfo-locations-dce.ll | 142 +++++++++++------------ llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll | 29 +---- llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll | 30 ++--- llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll | 20 +--- llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll | 3 +- 6 files changed, 165 insertions(+), 139 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index dae27c7..2208e41 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10030,6 +10030,72 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); } +/// Try to lower as an unpack of elements from two inputs followed by +/// a single-input permutation. +/// +/// This matches the pattern where we can unpack elements from two inputs and +/// then reduce the shuffle to a single-input (wider) permutation. +static SDValue lowerVectorShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + SelectionDAG &DAG) { + int NumElts = Mask.size(); + int NumLanes = VT.getSizeInBits() / 128; + int NumLaneElts = NumElts / NumLanes; + int NumHalfLaneElts = NumLaneElts / 2; + + bool MatchLo = true, MatchHi = true; + SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; + + // Determine UNPCKL/UNPCKH type and operand order. + for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { + for (int Elt = 0; Elt != NumLaneElts; ++Elt) { + int M = Mask[Lane + Elt]; + if (M < 0) + continue; + + SDValue &Op = Ops[Elt & 1]; + if (M < NumElts && (Op.isUndef() || Op == V1)) + Op = V1; + else if (NumElts <= M && (Op.isUndef() || Op == V2)) + Op = V2; + else + return SDValue(); + + int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts; + MatchLo &= isUndefOrInRange(M, Lo, Mid) || + isUndefOrInRange(M, NumElts + Lo, NumElts + Mid); + MatchHi &= isUndefOrInRange(M, Mid, Hi) || + isUndefOrInRange(M, NumElts + Mid, NumElts + Hi); + if (!MatchLo && !MatchHi) + return SDValue(); + } + } + assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"); + + // Now check that each pair of elts come from the same unpack pair + // and set the permute mask based on each pair. + // TODO - Investigate cases where we permute individual elements. + SmallVector PermuteMask(NumElts, -1); + for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { + for (int Elt = 0; Elt != NumLaneElts; Elt += 2) { + int M0 = Mask[Lane + Elt + 0]; + int M1 = Mask[Lane + Elt + 1]; + if (0 <= M0 && 0 <= M1 && + (M0 % NumHalfLaneElts) != (M1 % NumHalfLaneElts)) + return SDValue(); + if (0 <= M0) + PermuteMask[Lane + Elt + 0] = Lane + (2 * (M0 % NumHalfLaneElts)); + if (0 <= M1) + PermuteMask[Lane + Elt + 1] = Lane + (2 * (M1 % NumHalfLaneElts)) + 1; + } + } + + unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; + SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops); + return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); +} + /// Generic routine to decompose a shuffle and blend into independent /// blends and permutes. /// @@ -10056,15 +10122,19 @@ static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL, BlendMask[i] = i + Size; } - // Try to lower with the simpler initial blend strategy unless one of the - // input shuffles would be a no-op. We prefer to shuffle inputs as the + // Try to lower with the simpler initial blend/unpack strategies unless one of + // the input shuffles would be a no-op. We prefer to shuffle inputs as the // shuffle may be able to fold with a load or other benefit. However, when - // we'll have to do 2x as many shuffles in order to achieve this, blending - // first is a better strategy. - if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) + // we'll have to do 2x as many shuffles in order to achieve this, + // blending/unpacking first is a better strategy. + if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) return BlendPerm; + if (SDValue UnpackPerm = + lowerVectorShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) + return UnpackPerm; + } V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); diff --git a/llvm/test/CodeGen/X86/debuginfo-locations-dce.ll b/llvm/test/CodeGen/X86/debuginfo-locations-dce.ll index 359d5aa..6e1884f 100755 --- a/llvm/test/CodeGen/X86/debuginfo-locations-dce.ll +++ b/llvm/test/CodeGen/X86/debuginfo-locations-dce.ll @@ -1,71 +1,71 @@ -; RUN: llc -O2 %s -o %t -filetype=obj -; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s - -; Check that Machine CSE correctly handles during the transformation, the -; debug location information for variables. - -; Generated with clang -c -g -O2 - -; typedef float __attribute__((__vector_size__(16))) f4; -; f4 get(); -; int main() { -; float MyVar = get()[0]; -; if (MyVar) -; return 1; -; } - -; ModuleID = 'test.cpp' -source_filename = "test.cpp" -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-pc-linux-gnu" - -define dso_local i32 @main() !dbg !7 { -entry: - %call = tail call <4 x float> @_Z3getv(), !dbg !14 - %vecext = extractelement <4 x float> %call, i32 0, !dbg !14 - call void @llvm.dbg.value(metadata float %vecext, metadata !12, metadata !DIExpression()), !dbg !15 - %tobool = fcmp une float %vecext, 0.000000e+00, !dbg !16 - %. = zext i1 %tobool to i32, !dbg !18 - ret i32 %., !dbg !19 -} - -declare dso_local <4 x float> @_Z3getv() - -declare void @llvm.dbg.value(metadata, metadata, metadata) #2 - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 339665)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "test.cpp", directory: ".") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 1, !"wchar_size", i32 4} -!6 = !{!"clang version 8.0.0 (trunk 339665)"} -!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) -!8 = !DISubroutineType(types: !9) -!9 = !{!10} -!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!11 = !{!12} -!12 = !DILocalVariable(name: "MyVar", scope: !7, file: !1, line: 4, type: !13) -!13 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) -!14 = !DILocation(line: 4, column: 18, scope: !7) -!15 = !DILocation(line: 4, column: 9, scope: !7) -!16 = !DILocation(line: 5, column: 7, scope: !17) -!17 = distinct !DILexicalBlock(scope: !7, file: !1, line: 5, column: 7) -!18 = !DILocation(line: 6, column: 5, scope: !17) -!19 = !DILocation(line: 7, column: 1, scope: !7) - -; Look at the debug location information for variable 'MyVar'. -; Verify that we see a sequence of DI entries, that looks like: -; DW_TAG_variable -; DW_AT_location (0x00000000 -; [0x0000000000000009, 0x0000000000000012): DW_OP_reg17 XMM0) -; DW_AT_name ("MyVar") - -; CHECK-LABEL: DW_TAG_variable -; CHECK-NEXT: DW_AT_location{{.*}} -; CHECK-NEXT: {{.*}}DW_OP_reg17 XMM0 -; CHECK-NEXT: DW_AT_name{{.*}}("MyVar") +; RUN: llc -O2 %s -o %t -filetype=obj +; RUN: llvm-dwarfdump -debug-info %t | FileCheck %s + +; Check that Machine CSE correctly handles during the transformation, the +; debug location information for variables. + +; Generated with clang -c -g -O2 + +; typedef float __attribute__((__vector_size__(16))) f4; +; f4 get(); +; int main() { +; float MyVar = get()[0]; +; if (MyVar) +; return 1; +; } + +; ModuleID = 'test.cpp' +source_filename = "test.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +define dso_local i32 @main() !dbg !7 { +entry: + %call = tail call <4 x float> @_Z3getv(), !dbg !14 + %vecext = extractelement <4 x float> %call, i32 0, !dbg !14 + call void @llvm.dbg.value(metadata float %vecext, metadata !12, metadata !DIExpression()), !dbg !15 + %tobool = fcmp une float %vecext, 0.000000e+00, !dbg !16 + %. = zext i1 %tobool to i32, !dbg !18 + ret i32 %., !dbg !19 +} + +declare dso_local <4 x float> @_Z3getv() + +declare void @llvm.dbg.value(metadata, metadata, metadata) #2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 8.0.0 (trunk 339665)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "test.cpp", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 8.0.0 (trunk 339665)"} +!7 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !11) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !{!12} +!12 = !DILocalVariable(name: "MyVar", scope: !7, file: !1, line: 4, type: !13) +!13 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float) +!14 = !DILocation(line: 4, column: 18, scope: !7) +!15 = !DILocation(line: 4, column: 9, scope: !7) +!16 = !DILocation(line: 5, column: 7, scope: !17) +!17 = distinct !DILexicalBlock(scope: !7, file: !1, line: 5, column: 7) +!18 = !DILocation(line: 6, column: 5, scope: !17) +!19 = !DILocation(line: 7, column: 1, scope: !7) + +; Look at the debug location information for variable 'MyVar'. +; Verify that we see a sequence of DI entries, that looks like: +; DW_TAG_variable +; DW_AT_location (0x00000000 +; [0x0000000000000009, 0x0000000000000012): DW_OP_reg17 XMM0) +; DW_AT_name ("MyVar") + +; CHECK-LABEL: DW_TAG_variable +; CHECK-NEXT: DW_AT_location{{.*}} +; CHECK-NEXT: {{.*}}DW_OP_reg17 XMM0 +; CHECK-NEXT: DW_AT_name{{.*}}("MyVar") diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 4983737..6533557 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -978,30 +978,11 @@ define <16 x i8> @shuffle_v16i8_15_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30( ; PR31151 define <16 x i8> @shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23(<16 x i8> %val1, <16 x i8> %val2) { -; SSE2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: -; SSE2: # %bb.0: -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,1,3] -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE2-NEXT: pand %xmm1, %xmm0 -; SSE2-NEXT: pandn %xmm2, %xmm1 -; SSE2-NEXT: por %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: -; SSSE3: # %bb.0: -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: -; SSE41: # %bb.0: -; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE41-NEXT: retq +; SSE-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: +; SSE: # %bb.0: +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: retq ; ; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 1f44389..ab25ccb1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -1019,20 +1019,11 @@ define <16 x i16> @shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_2 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: +; AVX2: # %bb.0: +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_00_16_00_16_00_16_00_16_08_24_08_24_08_24_08_24: ; AVX512VL: # %bb.0: @@ -4272,9 +4263,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_2 ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_00_16_01_17_10_26_11_27_08_24_09_25: @@ -4308,10 +4298,8 @@ define <16 x i16> @shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_2 ; ; AVX2-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,0,1,12,13,2,3,16,17,20,21,20,21,22,23,16,17,16,17,28,29,18,19] -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[4,5,2,3,6,7,6,7,0,1,2,3,2,3,14,15,20,21,18,19,22,23,22,23,16,17,18,19,18,19,30,31] -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,3,0,2] ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_02_18_03_19_10_26_11_27_00_16_01_17_08_24_09_25: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index b00d75d..159335f 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1326,40 +1326,28 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; ; AVX2-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: retq ; ; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,3,4,5,6,7,8,8,10,11,12,13,14,15] ; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] -; AVX512VLBW-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-SLOW-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VLBW-SLOW-NEXT: kmovd %eax, %k1 -; AVX512VLBW-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VLBW-SLOW-NEXT: retq ; ; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,16,17,16,17,16,17,16,17,16,17,16,17,16,17,16,17] -; AVX512VLBW-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VLBW-FAST-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA -; AVX512VLBW-FAST-NEXT: kmovd %eax, %k1 -; AVX512VLBW-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VLBW-FAST-NEXT: retq ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 79e47da..2abf0ef 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -1293,9 +1293,8 @@ define <8 x i32> @shuffle_v8i32_08084c4c(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_08084c4c: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,2,0,4,4,6,4] +; AVX2OR512VL-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX2OR512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX2OR512VL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> ret <8 x i32> %shuffle -- 2.7.4