From 70562607ab385423e076b3c9851860291201e509 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 15 Mar 2023 09:56:22 +0000 Subject: [PATCH] [DAG] Fold multiple insert_vector_elt of zero values into an AND mask This also allows us to make use of the existing isVectorClearMaskLegal shuffle canonicalization Differential Revision: https://reviews.llvm.org/D145939 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 + llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 1593 ++++++++++---------- .../CodeGen/AArch64/vecreduce-add-legalization.ll | 10 +- .../CodeGen/AArch64/vecreduce-umax-legalization.ll | 10 +- .../CodeGen/X86/clear_upper_vector_element_bits.ll | 324 +--- llvm/test/CodeGen/X86/insertelement-zero.ll | 108 +- 6 files changed, 844 insertions(+), 1215 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 7ab4384..3d45c3e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20982,6 +20982,20 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return NewShuffle; } + // If all insertions are zero value, try to convert to AND mask. + // TODO: Do this for -1 with OR mask? + if (!LegalOperations && llvm::isNullConstant(InVal) && + all_of(Ops, [InVal](SDValue Op) { return !Op || Op == InVal; }) && + count_if(Ops, [InVal](SDValue Op) { return Op == InVal; }) >= 2) { + SDValue Zero = DAG.getConstant(0, DL, MaxEltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, MaxEltVT); + SmallVector<SDValue> Mask(NumElts); + for (unsigned I = 0; I != NumElts; ++I) + Mask[I] = Ops[I] ? Zero : AllOnes; + return DAG.getNode(ISD::AND, DL, VT, CurVec, + DAG.getBuildVector(VT, DL, Mask)); + } + // Failed to find a match in the chain - bail. 
break; } diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index 78d169d..1853779 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -130,16 +130,15 @@ entry: define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v5i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h -; CHECK-NEXT: mov v2.s[1], wzr -; CHECK-NEXT: mov v2.s[2], wzr -; CHECK-NEXT: mov v2.s[3], wzr -; CHECK-NEXT: umlal v2.4s, v0.4h, v1.4h -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: umull2 v3.4s, v1.8h, v2.8h +; CHECK-NEXT: mov v0.s[0], v3.s[0] +; CHECK-NEXT: umlal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -157,13 +156,12 @@ entry: define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v5i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr d0, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-NEXT: mov v1.s[1], wzr -; CHECK-NEXT: mov v1.s[2], wzr -; CHECK-NEXT: mov v1.s[3], wzr -; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0 +; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -176,16 +174,15 @@ entry: define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v5i8: ; CHECK: // %bb.0: // 
%entry -; CHECK-NEXT: ldr d0, [x1] -; CHECK-NEXT: ldr d1, [x0] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: ldr d2, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h -; CHECK-NEXT: mov v2.s[1], wzr -; CHECK-NEXT: mov v2.s[2], wzr -; CHECK-NEXT: mov v2.s[3], wzr -; CHECK-NEXT: smlal v2.4s, v0.4h, v1.4h -; CHECK-NEXT: addv s0, v2.4s +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: smull2 v3.4s, v1.8h, v2.8h +; CHECK-NEXT: mov v0.s[0], v3.s[0] +; CHECK-NEXT: smlal v0.4s, v1.4h, v2.4h +; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 ; CHECK-NEXT: ret @@ -207,17 +204,15 @@ define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: smull2 v4.4s, v0.8h, v1.8h -; CHECK-NEXT: smull2 v5.4s, v2.8h, v3.8h -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: mov v5.s[1], wzr -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: mov v5.s[2], wzr -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: mov v5.s[3], wzr -; CHECK-NEXT: smlal v4.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v5.4s, v2.4h, v3.4h -; CHECK-NEXT: add v0.4s, v4.4s, v5.4s +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: smull2 v5.4s, v0.8h, v1.8h +; CHECK-NEXT: movi v6.2d, #0000000000000000 +; CHECK-NEXT: smull2 v7.4s, v2.8h, v3.8h +; CHECK-NEXT: mov v6.s[0], v5.s[0] +; CHECK-NEXT: mov v4.s[0], v7.s[0] +; CHECK-NEXT: smlal v6.4s, v0.4h, v1.4h +; CHECK-NEXT: smlal v4.4s, v2.4h, v3.4h +; CHECK-NEXT: add v0.4s, v6.4s, v4.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -237,18 +232,16 @@ entry: define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) { ; CHECK-LABEL: test_sdot_v5i8_double_nomla: ; CHECK: // %bb.0: // %entry +; CHECK-NEXT: movi v1.2d, #0000000000000000 
+; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v1.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v2.4s, v0.8h, #0 -; CHECK-NEXT: sshll2 v3.4s, v1.8h, #0 -; CHECK-NEXT: mov v2.s[1], wzr -; CHECK-NEXT: mov v3.s[1], wzr -; CHECK-NEXT: mov v2.s[2], wzr -; CHECK-NEXT: mov v3.s[2], wzr -; CHECK-NEXT: mov v2.s[3], wzr -; CHECK-NEXT: mov v3.s[3], wzr -; CHECK-NEXT: saddw v0.4s, v2.4s, v0.4h -; CHECK-NEXT: saddw v1.4s, v3.4s, v1.4h +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: sshll2 v4.4s, v0.8h, #0 +; CHECK-NEXT: sshll2 v5.4s, v2.8h, #0 +; CHECK-NEXT: mov v3.s[0], v4.s[0] +; CHECK-NEXT: mov v1.s[0], v5.s[0] +; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: saddw v1.4s, v1.4s, v2.4h ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1005,28 +998,27 @@ entry: define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q3, q0, [x1] -; CHECK-NEXT: ushll v6.8h, v3.8b, #0 -; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: ushll2 v5.8h, v0.16b, #0 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v1.16b, #0 -; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull v4.4s, v5.4h, v4.4h -; CHECK-NEXT: ushll2 v5.8h, v2.16b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: umull2 v7.4s, v6.8h, v2.8h -; CHECK-NEXT: umull v2.4s, v6.4h, v2.4h -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: umlal2 v7.4s, v0.8h, v1.8h -; CHECK-NEXT: umlal v2.4s, v0.4h, v1.4h -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: umlal2 v7.4s, v3.8h, v5.8h -; CHECK-NEXT: umlal v4.4s, v3.4h, v5.4h -; CHECK-NEXT: add v0.4s, v2.4s, v7.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: ushll2 v7.8h, 
v4.16b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll v5.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: umull v2.4s, v7.4h, v2.4h +; CHECK-NEXT: ushll v7.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 +; CHECK-NEXT: umull2 v16.4s, v7.8h, v3.8h +; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: umull v2.4s, v7.4h, v3.4h +; CHECK-NEXT: umlal2 v16.4s, v4.8h, v5.8h +; CHECK-NEXT: umlal v0.4s, v1.4h, v6.4h +; CHECK-NEXT: umlal v2.4s, v4.4h, v5.4h +; CHECK-NEXT: umlal2 v16.4s, v1.8h, v6.8h +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1045,21 +1037,20 @@ entry: define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v25i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q1, q0, [x0] -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 v1.8h, v1.16b, #0 -; CHECK-NEXT: ushll2 v2.8h, v0.16b, #0 -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v2.4s, v2.4h, #0 -; CHECK-NEXT: uaddl2 v4.4s, v3.8h, v0.8h -; CHECK-NEXT: mov v2.s[1], wzr -; CHECK-NEXT: uaddl v0.4s, v3.4h, v0.4h -; CHECK-NEXT: uaddw2 v3.4s, v4.4s, v1.8h -; CHECK-NEXT: mov v2.s[2], wzr -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: mov v2.s[3], wzr -; CHECK-NEXT: uaddw v1.4s, v2.4s, v1.4h -; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ldp q1, q2, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ushll2 v3.8h, v1.16b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v4.8h, v2.16b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: ushll v4.4s, v4.4h, #0 +; CHECK-NEXT: uaddl2 v5.4s, v1.8h, v2.8h +; CHECK-NEXT: mov v0.s[0], v4.s[0] +; CHECK-NEXT: uaddl v1.4s, v1.4h, v2.4h +; CHECK-NEXT: uaddw2 v2.4s, v5.4s, v3.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h +; CHECK-NEXT: add 
v1.4s, v1.4s, v2.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -1072,28 +1063,27 @@ entry: define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v25i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldp q3, q0, [x1] -; CHECK-NEXT: sshll v6.8h, v3.8b, #0 -; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: sshll2 v5.8h, v0.16b, #0 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v1.16b, #0 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull v4.4s, v5.4h, v4.4h -; CHECK-NEXT: sshll2 v5.8h, v2.16b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: smull2 v7.4s, v6.8h, v2.8h -; CHECK-NEXT: smull v2.4s, v6.4h, v2.4h -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: smlal2 v7.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal v2.4s, v0.4h, v1.4h -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: smlal2 v7.4s, v3.8h, v5.8h -; CHECK-NEXT: smlal v4.4s, v3.4h, v5.4h -; CHECK-NEXT: add v0.4s, v2.4s, v7.4s -; CHECK-NEXT: add v0.4s, v0.4s, v4.4s +; CHECK-NEXT: ldp q1, q4, [x1] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll2 v6.8h, v3.16b, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: sshll v5.8h, v2.8b, #0 +; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: smull v2.4s, v7.4h, v2.4h +; CHECK-NEXT: sshll v7.8h, v1.8b, #0 +; CHECK-NEXT: sshll2 v1.8h, v1.16b, #0 +; CHECK-NEXT: smull2 v16.4s, v7.8h, v3.8h +; CHECK-NEXT: mov v0.s[0], v2.s[0] +; CHECK-NEXT: smull v2.4s, v7.4h, v3.4h +; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h +; CHECK-NEXT: smlal v0.4s, v1.4h, v6.4h +; CHECK-NEXT: smlal v2.4s, v4.4h, v5.4h +; CHECK-NEXT: smlal2 v16.4s, v1.8h, v6.8h +; CHECK-NEXT: add v0.4s, v2.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v16.4s ; 
CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: add w0, w8, w2 @@ -1118,220 +1108,218 @@ define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 ; CHECK-NEXT: ldr b2, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b0, [sp, #16] -; CHECK-NEXT: add x9, sp, #96 -; CHECK-NEXT: ldr b4, [sp, #280] -; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: add x10, sp, #40 +; CHECK-NEXT: add x11, sp, #128 ; CHECK-NEXT: ld1 { v2.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #24 -; CHECK-NEXT: add x11, sp, #304 -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: ld1 { v4.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #296 -; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #104 -; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: ld1 { v0.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ldr b1, [sp, #216] -; CHECK-NEXT: fmov s6, w0 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #128 +; CHECK-NEXT: ldr b17, [sp, #152] +; CHECK-NEXT: fmov s4, w0 +; CHECK-NEXT: ldr b6, [sp, #280] +; CHECK-NEXT: add x12, sp, #224 +; CHECK-NEXT: ld1 { v2.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v0.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #112 +; CHECK-NEXT: ldr b1, [sp, #216] +; CHECK-NEXT: mov v4.b[1], w1 +; CHECK-NEXT: ldr b3, [sp, #480] ; CHECK-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #40 -; CHECK-NEXT: ldr b16, [sp, #152] -; CHECK-NEXT: ld1 { v4.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #224 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #120 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: ld1 { v1.b }[1], [x12] +; CHECK-NEXT: mov v4.b[2], w2 +; CHECK-NEXT: ldr b18, [sp, #352] ; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: ld1 { v1.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #328 -; CHECK-NEXT: ldr b3, [sp, #480] -; CHECK-NEXT: ld1 { v0.b }[4], 
[x9] -; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: add x9, sp, #56 +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #288 +; CHECK-NEXT: ldr b20, [sp, #680] +; CHECK-NEXT: mov v4.b[3], w3 +; CHECK-NEXT: ldr b5, [sp, #144] ; CHECK-NEXT: ld1 { v2.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ld1 { v6.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #296 +; CHECK-NEXT: ld1 { v17.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: mov v6.b[1], w1 -; CHECK-NEXT: ldr b7, [sp, #352] -; CHECK-NEXT: ldr b19, [sp, #552] -; CHECK-NEXT: ld1 { v0.b }[5], [x12] -; CHECK-NEXT: add x12, sp, #72 -; CHECK-NEXT: ld1 { v2.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #160 -; CHECK-NEXT: mov v6.b[2], w2 -; CHECK-NEXT: ldr b5, [sp, #144] -; CHECK-NEXT: ldr b17, [sp, #344] +; CHECK-NEXT: ld1 { v2.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #320 ; CHECK-NEXT: ld1 { v0.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #232 -; CHECK-NEXT: ld1 { v16.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: ld1 { v6.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #304 +; CHECK-NEXT: mov v4.b[4], w4 ; CHECK-NEXT: ld1 { v2.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #168 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #240 -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #320 -; CHECK-NEXT: ld1 { v16.b }[2], [x8] +; CHECK-NEXT: ld1 { v0.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #232 +; CHECK-NEXT: ld1 { v6.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #312 +; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: mov v6.b[3], w3 -; CHECK-NEXT: ldr b20, [sp, #544] -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #248 -; CHECK-NEXT: ld1 { v4.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #488 -; CHECK-NEXT: ld1 { v16.b }[3], [x8] +; CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: mov v4.b[5], w5 +; 
CHECK-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #240 +; CHECK-NEXT: ld1 { v3.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #496 +; CHECK-NEXT: ld1 { v17.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #184 -; CHECK-NEXT: mov v6.b[4], w4 -; CHECK-NEXT: ld1 { v0.b }[7], [x12] -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #256 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #496 -; CHECK-NEXT: ld1 { v16.b }[4], [x8] +; CHECK-NEXT: ld1 { v1.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #248 +; CHECK-NEXT: mov v4.b[6], w6 +; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: ld1 { v3.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #504 +; CHECK-NEXT: ld1 { v17.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: mov v6.b[5], w5 -; CHECK-NEXT: ld1 { v4.b }[6], [x11] -; CHECK-NEXT: ld1 { v1.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #264 -; CHECK-NEXT: ld1 { v3.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #504 -; CHECK-NEXT: ld1 { v16.b }[5], [x8] +; CHECK-NEXT: ld1 { v1.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #256 +; CHECK-NEXT: add x11, sp, #328 +; CHECK-NEXT: ld1 { v3.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #512 +; CHECK-NEXT: ld1 { v17.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #200 -; CHECK-NEXT: mov v6.b[6], w6 +; CHECK-NEXT: ld1 { v1.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #264 +; CHECK-NEXT: mov v4.b[7], w7 +; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: ld1 { v3.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #520 +; CHECK-NEXT: ld1 { v17.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #208 +; CHECK-NEXT: ld1 { v1.b }[6], [x10] ; CHECK-NEXT: add x11, sp, #336 -; CHECK-NEXT: ld1 { v1.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #272 -; CHECK-NEXT: ld1 { v3.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: ld1 { v16.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #512 -; CHECK-NEXT: mov v6.b[7], w7 -; CHECK-NEXT: ld1 { v4.b }[7], [x11] -; CHECK-NEXT: ld1 { v1.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #208 -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: 
add x8, sp, #520 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: add x10, sp, #272 +; CHECK-NEXT: ld1 { v3.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #536 +; CHECK-NEXT: ld1 { v17.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #528 +; CHECK-NEXT: sshll v19.8h, v4.8b, #0 +; CHECK-NEXT: ldr b4, [sp, #416] +; CHECK-NEXT: ld1 { v6.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #360 -; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #368 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #424 +; CHECK-NEXT: ld1 { v1.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #360 +; CHECK-NEXT: sshll v7.8h, v2.8b, #0 +; CHECK-NEXT: ldr b2, [sp, #344] +; CHECK-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #432 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0 +; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: sshll v16.8h, v6.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: sshll v6.8h, v2.8b, #0 +; CHECK-NEXT: add x9, sp, #560 +; CHECK-NEXT: smull v2.4s, v19.4h, v17.4h +; CHECK-NEXT: ld1 { v4.b }[2], [x8] +; CHECK-NEXT: smull2 v17.4s, v19.8h, v17.8h +; CHECK-NEXT: ldr b19, [sp, #552] +; CHECK-NEXT: add x8, sp, #368 +; CHECK-NEXT: add x10, sp, #440 +; CHECK-NEXT: ld1 { v20.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #696 ; CHECK-NEXT: ld1 { v19.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #376 -; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: ld1 { v18.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: smull2 v18.4s, v6.8h, v16.8h -; CHECK-NEXT: ld1 { v3.b }[6], [x10] -; CHECK-NEXT: smull v6.4s, v6.4h, v16.4h -; CHECK-NEXT: ldr b16, [sp, #416] +; CHECK-NEXT: ld1 { v4.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #448 +; CHECK-NEXT: ld1 { v20.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #704 ; CHECK-NEXT: ld1 { v19.b }[2], [x8] ; CHECK-NEXT: add x8, sp, 
#576 -; CHECK-NEXT: ld1 { v7.b }[3], [x9] +; CHECK-NEXT: ld1 { v18.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #384 -; CHECK-NEXT: add x10, sp, #424 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: smlal v2.4s, v7.4h, v16.4h +; CHECK-NEXT: ld1 { v4.b }[4], [x10] +; CHECK-NEXT: smlal2 v17.4s, v7.8h, v16.8h +; CHECK-NEXT: ldr b7, [sp, #616] ; CHECK-NEXT: ld1 { v19.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: ld1 { v7.b }[4], [x9] +; CHECK-NEXT: ld1 { v18.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #392 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[1], [x10] -; CHECK-NEXT: smull v5.4s, v5.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #680] +; CHECK-NEXT: add x10, sp, #456 +; CHECK-NEXT: ld1 { v20.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #400 ; CHECK-NEXT: ld1 { v19.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #592 -; CHECK-NEXT: ld1 { v7.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #400 -; CHECK-NEXT: add x10, sp, #432 -; CHECK-NEXT: ld1 { v17.b }[1], [x11] -; CHECK-NEXT: smlal v6.4s, v2.4h, v4.4h -; CHECK-NEXT: add x11, sp, #696 +; CHECK-NEXT: ld1 { v18.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #624 +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: ld1 { v7.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #632 ; CHECK-NEXT: ld1 { v19.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #600 -; CHECK-NEXT: ld1 { v7.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #408 -; CHECK-NEXT: smlal2 v18.4s, v2.8h, v4.8h -; CHECK-NEXT: ldr b2, [sp, #616] -; CHECK-NEXT: ld1 { v16.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #440 +; CHECK-NEXT: ld1 { v20.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #720 +; CHECK-NEXT: ld1 { v18.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #408 +; CHECK-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #640 ; CHECK-NEXT: ld1 { v19.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #608 -; CHECK-NEXT: ld1 { v7.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #624 -; CHECK-NEXT: ld1 { v17.b }[2], [x11] -; CHECK-NEXT: add 
x11, sp, #536 -; CHECK-NEXT: ld1 { v16.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #704 -; CHECK-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #632 -; CHECK-NEXT: ld1 { v19.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #448 -; CHECK-NEXT: ld1 { v17.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #712 -; CHECK-NEXT: sshll v4.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[7], [x11] -; CHECK-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #640 -; CHECK-NEXT: sshll v7.8h, v19.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[4], [x8] -; CHECK-NEXT: ld1 { v17.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: smull2 v19.4s, v4.8h, v7.8h -; CHECK-NEXT: add x8, sp, #456 -; CHECK-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #648 -; CHECK-NEXT: smull v4.4s, v4.4h, v7.4h -; CHECK-NEXT: ldr b7, [sp, #744] -; CHECK-NEXT: sshll v20.8h, v20.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[5], [x10] +; CHECK-NEXT: ld1 { v20.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: ld1 { v16.b }[5], [x8] -; CHECK-NEXT: sshll v7.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #656 -; CHECK-NEXT: add x8, sp, #464 -; CHECK-NEXT: smull v7.4s, v20.4h, v7.4h -; CHECK-NEXT: ld1 { v17.b }[6], [x10] -; CHECK-NEXT: mov v5.s[1], wzr -; CHECK-NEXT: add x10, sp, #736 -; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: ld1 { v18.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #464 +; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #664 -; CHECK-NEXT: ld1 { v16.b }[6], [x8] +; CHECK-NEXT: ld1 { v19.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #648 +; CHECK-NEXT: ld1 { v20.b }[6], [x10] +; CHECK-NEXT: add x10, sp, #736 +; CHECK-NEXT: sshll v16.8h, v18.8b, #0 +; CHECK-NEXT: ld1 { v4.b }[6], [x11] +; CHECK-NEXT: ld1 { v7.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #656 +; CHECK-NEXT: sshll v18.8h, v19.8b, #0 +; CHECK-NEXT: ld1 { v20.b }[7], [x10] +; CHECK-NEXT: smull v19.4s, v16.4h, v18.4h +; CHECK-NEXT: ld1 { v7.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #472 -; CHECK-NEXT: mov 
v7.s[1], wzr -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: mov v5.s[2], wzr -; CHECK-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #672 -; CHECK-NEXT: ld1 { v16.b }[7], [x8] -; CHECK-NEXT: mov v7.s[2], wzr +; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h +; CHECK-NEXT: ldr b18, [sp, #544] +; CHECK-NEXT: smull v5.4s, v5.4h, v6.4h +; CHECK-NEXT: ldr b6, [sp, #744] ; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: mov v5.s[3], wzr -; CHECK-NEXT: mov v7.s[3], wzr -; CHECK-NEXT: smlal v4.4s, v3.4h, v17.4h -; CHECK-NEXT: smlal2 v19.4s, v3.8h, v17.8h +; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: add x9, sp, #672 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: smlal v19.4s, v3.4h, v20.4h +; CHECK-NEXT: smlal2 v16.4s, v3.8h, v20.8h +; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: movi v3.2d, #0000000000000000 +; CHECK-NEXT: smull v6.4s, v18.4h, v6.4h +; CHECK-NEXT: movi v18.2d, #0000000000000000 ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: mov v3.s[0], v5.s[0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: sshll v3.8h, v16.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: smlal v5.4s, v0.4h, v1.4h -; CHECK-NEXT: smlal v7.4s, v3.4h, v2.4h -; CHECK-NEXT: smlal2 v18.4s, v0.8h, v1.8h -; CHECK-NEXT: smlal2 v19.4s, v3.8h, v2.8h -; CHECK-NEXT: add v0.4s, v6.4s, v5.4s -; CHECK-NEXT: add v1.4s, v4.4s, v7.4s -; CHECK-NEXT: add v0.4s, v0.4s, v18.4s -; CHECK-NEXT: add v1.4s, v1.4s, v19.4s +; CHECK-NEXT: mov v18.s[0], v6.s[0] +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v5.8h, v7.8b, #0 +; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h +; CHECK-NEXT: smlal v18.4s, v4.4h, v5.4h +; CHECK-NEXT: smlal2 v17.4s, v0.8h, v1.8h +; CHECK-NEXT: smlal2 v16.4s, v4.8h, v5.8h +; CHECK-NEXT: add v0.4s, v2.4s, v3.4s +; CHECK-NEXT: add v1.4s, v19.4s, v18.4s 
+; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: add v1.4s, v1.4s, v16.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1360,115 +1348,113 @@ define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> % ; CHECK-NEXT: add x8, sp, #88 ; CHECK-NEXT: ldr b2, [sp, #16] ; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: fmov s1, w0 ; CHECK-NEXT: ldr b3, [sp, #480] -; CHECK-NEXT: add x10, sp, #488 ; CHECK-NEXT: ld1 { v0.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: ld1 { v2.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: add x11, sp, #120 -; CHECK-NEXT: ld1 { v3.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #496 -; CHECK-NEXT: add x12, sp, #48 +; CHECK-NEXT: mov v1.b[1], w1 +; CHECK-NEXT: add x10, sp, #488 +; CHECK-NEXT: add x11, sp, #496 +; CHECK-NEXT: ldr b4, [sp, #352] ; CHECK-NEXT: ld1 { v0.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #104 ; CHECK-NEXT: ld1 { v2.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ldr b4, [sp, #144] -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: ld1 { v3.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: ld1 { v3.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: mov v1.b[2], w2 +; CHECK-NEXT: ldr b6, [sp, #416] ; CHECK-NEXT: ld1 { v0.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #112 ; CHECK-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #136 -; CHECK-NEXT: sshll v4.8h, v4.8b, #0 -; CHECK-NEXT: ldr b6, [sp, #416] -; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: ldr b16, [sp, #544] +; CHECK-NEXT: add x9, sp, #128 +; CHECK-NEXT: ld1 { v3.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #56 +; CHECK-NEXT: mov v1.b[3], w3 +; CHECK-NEXT: add x12, sp, #504 ; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #128 -; CHECK-NEXT: ld1 { v2.b }[4], [x12] -; CHECK-NEXT: add x12, sp, #56 -; CHECK-NEXT: sshll v5.4s, v4.4h, #0 -; CHECK-NEXT: ldr b4, [sp, #352] -; CHECK-NEXT: mov v1.b[2], w2 -; CHECK-NEXT: ld1 { v0.b }[5], [x11] -; CHECK-NEXT: add 
x11, sp, #504 -; CHECK-NEXT: ld1 { v2.b }[5], [x12] -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 -; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: add x8, sp, #120 +; CHECK-NEXT: ld1 { v2.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #64 +; CHECK-NEXT: ldr b5, [sp, #144] +; CHECK-NEXT: mov v1.b[4], w4 +; CHECK-NEXT: ld1 { v3.b }[3], [x12] +; CHECK-NEXT: ld1 { v0.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #136 +; CHECK-NEXT: ld1 { v2.b }[5], [x11] ; CHECK-NEXT: add x11, sp, #360 -; CHECK-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #512 +; CHECK-NEXT: add x12, sp, #72 +; CHECK-NEXT: mov v1.b[5], w5 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #424 +; CHECK-NEXT: ld1 { v4.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #512 ; CHECK-NEXT: ld1 { v2.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #368 -; CHECK-NEXT: ld1 { v4.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #432 -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #520 -; CHECK-NEXT: ld1 { v0.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: mov v1.b[3], w3 -; CHECK-NEXT: ld1 { v4.b }[2], [x10] -; CHECK-NEXT: add x10, sp, #376 ; CHECK-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #72 -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #528 -; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #440 -; CHECK-NEXT: ld1 { v6.b }[2], [x11] -; CHECK-NEXT: ld1 { v4.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #384 -; CHECK-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #392 -; CHECK-NEXT: mov v1.b[5], w5 -; CHECK-NEXT: ld1 { v6.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #448 -; CHECK-NEXT: ld1 { v4.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #456 -; CHECK-NEXT: sshll v16.4s, v16.4h, #0 +; CHECK-NEXT: add x9, sp, #376 +; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #432 +; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #520 ; CHECK-NEXT: mov v1.b[6], w6 -; CHECK-NEXT: ld1 { v6.b }[4], [x9] -; 
CHECK-NEXT: add x9, sp, #536 -; CHECK-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #400 -; CHECK-NEXT: mov v5.s[1], wzr -; CHECK-NEXT: mov v16.s[1], wzr -; CHECK-NEXT: ld1 { v3.b }[7], [x9] -; CHECK-NEXT: ld1 { v6.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #464 -; CHECK-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #408 +; CHECK-NEXT: ld1 { v2.b }[7], [x12] +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #440 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: ld1 { v3.b }[4], [x11] +; CHECK-NEXT: ld1 { v4.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #384 +; CHECK-NEXT: movi v7.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #448 ; CHECK-NEXT: mov v1.b[7], w7 -; CHECK-NEXT: add x9, sp, #472 -; CHECK-NEXT: mov v5.s[2], wzr -; CHECK-NEXT: ld1 { v6.b }[6], [x10] -; CHECK-NEXT: mov v16.s[2], wzr -; CHECK-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-NEXT: ld1 { v3.b }[5], [x10] +; CHECK-NEXT: sshll v5.4s, v5.4h, #0 +; CHECK-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #392 +; CHECK-NEXT: add x10, sp, #528 +; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #456 +; CHECK-NEXT: mov v7.s[0], v5.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v4.b }[5], [x9] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v6.b }[7], [x9] -; CHECK-NEXT: mov v5.s[3], wzr -; CHECK-NEXT: mov v16.s[3], wzr +; CHECK-NEXT: add x9, sp, #400 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #464 +; CHECK-NEXT: ld1 { v3.b }[6], [x10] +; CHECK-NEXT: saddw v5.4s, v7.4s, v2.4h +; CHECK-NEXT: ld1 { v4.b }[6], [x9] ; CHECK-NEXT: saddl v7.4s, v1.4h, v0.4h +; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: add x9, sp, #408 +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #472 +; CHECK-NEXT: add v5.4s, v7.4s, v5.4s +; CHECK-NEXT: ldr b7, [sp, #544] ; CHECK-NEXT: saddl2 v0.4s, v1.8h, v0.8h -; CHECK-NEXT: sshll v1.8h, v3.8b, #0 -; CHECK-NEXT: 
sshll v3.8h, v4.8b, #0 -; CHECK-NEXT: sshll v2.8h, v2.8b, #0 -; CHECK-NEXT: sshll v4.8h, v6.8b, #0 -; CHECK-NEXT: saddl v6.4s, v3.4h, v1.4h -; CHECK-NEXT: saddl2 v1.4s, v3.8h, v1.8h -; CHECK-NEXT: saddw v5.4s, v5.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v16.4s, v4.4h +; CHECK-NEXT: ld1 { v3.b }[7], [x10] +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: ld1 { v4.b }[7], [x9] +; CHECK-NEXT: sshll v7.8h, v7.8b, #0 +; CHECK-NEXT: ld1 { v6.b }[7], [x8] ; CHECK-NEXT: saddw2 v0.4s, v0.4s, v2.8h -; CHECK-NEXT: saddw2 v1.4s, v1.4s, v4.8h -; CHECK-NEXT: add v5.4s, v7.4s, v5.4s -; CHECK-NEXT: add v2.4s, v6.4s, v3.4s +; CHECK-NEXT: sshll v7.4s, v7.4h, #0 +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: mov v1.s[0], v7.s[0] +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: saddl v7.4s, v4.4h, v3.4h +; CHECK-NEXT: saddl2 v3.4s, v4.8h, v3.8h +; CHECK-NEXT: saddw v1.4s, v1.4s, v6.4h ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: saddw2 v2.4s, v3.4s, v6.8h +; CHECK-NEXT: add v1.4s, v7.4s, v1.4s +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -1601,34 +1587,33 @@ entry: define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_udot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x1, #32] -; CHECK-NEXT: ldr b1, [x0, #32] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldr b2, [x0, #32] +; CHECK-NEXT: ldp q3, q4, [x0] ; CHECK-NEXT: ushll v1.8h, v1.8b, #0 -; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: ushll v5.8h, v2.8b, #0 -; CHECK-NEXT: ushll2 v4.8h, v3.16b, #0 -; CHECK-NEXT: ushll v1.8h, v3.8b, #0 -; CHECK-NEXT: mov v0.s[1], wzr -; CHECK-NEXT: ldp q3, q6, [x1] -; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: mov v0.s[2], wzr -; CHECK-NEXT: ushll2 
v16.8h, v3.16b, #0 -; CHECK-NEXT: ushll v3.8h, v3.8b, #0 -; CHECK-NEXT: umull2 v17.4s, v16.8h, v2.8h -; CHECK-NEXT: umull2 v18.4s, v3.8h, v5.8h -; CHECK-NEXT: mov v0.s[3], wzr -; CHECK-NEXT: ushll2 v7.8h, v6.16b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: ushll v16.8h, v3.8b, #0 +; CHECK-NEXT: ldp q5, q6, [x1] +; CHECK-NEXT: ushll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ushll2 v7.8h, v4.16b, #0 +; CHECK-NEXT: ushll2 v2.8h, v5.16b, #0 +; CHECK-NEXT: ushll v5.8h, v5.8b, #0 +; CHECK-NEXT: umull2 v18.4s, v2.8h, v3.8h +; CHECK-NEXT: umull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: ushll2 v17.8h, v6.16b, #0 ; CHECK-NEXT: ushll v6.8h, v6.8b, #0 -; CHECK-NEXT: umull v2.4s, v16.4h, v2.4h -; CHECK-NEXT: umlal2 v17.4s, v7.8h, v4.8h -; CHECK-NEXT: umlal2 v18.4s, v6.8h, v1.8h -; CHECK-NEXT: umlal v0.4s, v3.4h, v5.4h -; CHECK-NEXT: umlal v2.4s, v7.4h, v4.4h -; CHECK-NEXT: add v3.4s, v18.4s, v17.4s -; CHECK-NEXT: umlal v0.4s, v6.4h, v1.4h -; CHECK-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h +; CHECK-NEXT: umlal2 v18.4s, v17.8h, v7.8h +; CHECK-NEXT: umlal2 v1.4s, v6.8h, v4.8h +; CHECK-NEXT: umlal v0.4s, v5.4h, v16.4h +; CHECK-NEXT: umlal v2.4s, v17.4h, v7.4h +; CHECK-NEXT: add v1.4s, v1.4s, v18.4s +; CHECK-NEXT: umlal v0.4s, v6.4h, v4.4h +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1648,23 +1633,22 @@ entry: define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) { ; CHECK-LABEL: test_udot_v33i8_nomla: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x0, #32] -; CHECK-NEXT: ldp q2, q1, [x0] -; CHECK-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: ushll2 v4.8h, v2.16b, #0 -; CHECK-NEXT: mov v0.s[1], wzr -; CHECK-NEXT: ushll v3.8h, v1.8b, #0 -; CHECK-NEXT: ushll2 
v1.8h, v1.16b, #0 -; CHECK-NEXT: ushll v2.8h, v2.8b, #0 -; CHECK-NEXT: mov v0.s[2], wzr -; CHECK-NEXT: uaddl2 v5.4s, v4.8h, v1.8h -; CHECK-NEXT: uaddl2 v6.4s, v2.8h, v3.8h -; CHECK-NEXT: uaddl v1.4s, v4.4h, v1.4h -; CHECK-NEXT: mov v0.s[3], wzr -; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h -; CHECK-NEXT: add v2.4s, v6.4s, v5.4s -; CHECK-NEXT: uaddw v0.4s, v0.4s, v3.4h +; CHECK-NEXT: ldr b1, [x0, #32] +; CHECK-NEXT: ldp q3, q2, [x0] +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v1.4s, v1.4h, #0 +; CHECK-NEXT: ushll2 v5.8h, v3.16b, #0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: ushll v4.8h, v2.8b, #0 +; CHECK-NEXT: ushll2 v2.8h, v2.16b, #0 +; CHECK-NEXT: ushll v1.8h, v3.8b, #0 +; CHECK-NEXT: uaddl2 v3.4s, v5.8h, v2.8h +; CHECK-NEXT: uaddl2 v6.4s, v1.8h, v4.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: uaddl v1.4s, v5.4h, v2.4h +; CHECK-NEXT: add v2.4s, v6.4s, v3.4s +; CHECK-NEXT: uaddw v0.4s, v0.4s, v4.4h ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s @@ -1679,34 +1663,33 @@ entry: define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) { ; CHECK-LABEL: test_sdot_v33i8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr b0, [x1, #32] -; CHECK-NEXT: ldr b1, [x0, #32] -; CHECK-NEXT: ldp q2, q3, [x0] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ldr b1, [x1, #32] +; CHECK-NEXT: ldr b2, [x0, #32] +; CHECK-NEXT: ldp q3, q4, [x0] ; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-NEXT: sshll v5.8h, v2.8b, #0 -; CHECK-NEXT: sshll2 v4.8h, v3.16b, #0 -; CHECK-NEXT: sshll v1.8h, v3.8b, #0 -; CHECK-NEXT: mov v0.s[1], wzr -; CHECK-NEXT: ldp q3, q6, [x1] -; CHECK-NEXT: sshll2 v2.8h, v2.16b, #0 -; CHECK-NEXT: mov v0.s[2], wzr -; CHECK-NEXT: sshll2 v16.8h, v3.16b, #0 -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: smull2 v17.4s, v16.8h, v2.8h -; CHECK-NEXT: smull2 v18.4s, 
v3.8h, v5.8h -; CHECK-NEXT: mov v0.s[3], wzr -; CHECK-NEXT: sshll2 v7.8h, v6.16b, #0 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 +; CHECK-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NEXT: smull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: sshll v16.8h, v3.8b, #0 +; CHECK-NEXT: ldp q5, q6, [x1] +; CHECK-NEXT: sshll2 v3.8h, v3.16b, #0 +; CHECK-NEXT: mov v0.s[0], v1.s[0] +; CHECK-NEXT: sshll2 v7.8h, v4.16b, #0 +; CHECK-NEXT: sshll2 v2.8h, v5.16b, #0 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 +; CHECK-NEXT: smull2 v18.4s, v2.8h, v3.8h +; CHECK-NEXT: smull2 v1.4s, v5.8h, v16.8h +; CHECK-NEXT: sshll v4.8h, v4.8b, #0 +; CHECK-NEXT: sshll2 v17.8h, v6.16b, #0 ; CHECK-NEXT: sshll v6.8h, v6.8b, #0 -; CHECK-NEXT: smull v2.4s, v16.4h, v2.4h -; CHECK-NEXT: smlal2 v17.4s, v7.8h, v4.8h -; CHECK-NEXT: smlal2 v18.4s, v6.8h, v1.8h -; CHECK-NEXT: smlal v0.4s, v3.4h, v5.4h -; CHECK-NEXT: smlal v2.4s, v7.4h, v4.4h -; CHECK-NEXT: add v3.4s, v18.4s, v17.4s -; CHECK-NEXT: smlal v0.4s, v6.4h, v1.4h -; CHECK-NEXT: add v1.4s, v2.4s, v3.4s +; CHECK-NEXT: smull v2.4s, v2.4h, v3.4h +; CHECK-NEXT: smlal2 v18.4s, v17.8h, v7.8h +; CHECK-NEXT: smlal2 v1.4s, v6.8h, v4.8h +; CHECK-NEXT: smlal v0.4s, v5.4h, v16.4h +; CHECK-NEXT: smlal v2.4s, v17.4h, v7.4h +; CHECK-NEXT: add v1.4s, v1.4s, v18.4s +; CHECK-NEXT: smlal v0.4s, v6.4h, v4.4h +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w8, s0 @@ -1731,291 +1714,289 @@ define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 +; CHECK-NEXT: ldr b1, [sp, #144] +; CHECK-NEXT: add x9, sp, #96 ; CHECK-NEXT: ldr b3, [sp, #16] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: ldr b2, [sp, #144] ; CHECK-NEXT: add x10, sp, #104 ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #96 +; CHECK-NEXT: add x8, sp, #152 ; CHECK-NEXT: ldr b4, [sp, #344] -; CHECK-NEXT: add x11, sp, #368 -; 
CHECK-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #160 -; CHECK-NEXT: ldr b7, [sp, #216] -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: ld1 { v0.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #24 +; CHECK-NEXT: fmov s2, w0 +; CHECK-NEXT: ldr b6, [sp, #216] +; CHECK-NEXT: add x11, sp, #136 +; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #160 +; CHECK-NEXT: ld1 { v0.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: mov v2.b[1], w1 ; CHECK-NEXT: ldr b17, [sp, #280] -; CHECK-NEXT: ld1 { v2.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #168 -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #32 +; CHECK-NEXT: ldr b7, [sp, #408] +; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #168 +; CHECK-NEXT: ld1 { v3.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #32 ; CHECK-NEXT: ld1 { v0.b }[3], [x10] ; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: ldr b16, [sp, #408] -; CHECK-NEXT: ld1 { v2.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: mov v2.b[2], w2 +; CHECK-NEXT: ldr b5, [sp, #208] +; CHECK-NEXT: ld1 { v1.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #176 +; CHECK-NEXT: ld1 { v3.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #40 ; CHECK-NEXT: ld1 { v0.b }[4], [x10] ; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ldr b5, [sp, #208] -; CHECK-NEXT: ld1 { v2.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #184 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #48 +; CHECK-NEXT: mov v2.b[3], w3 +; CHECK-NEXT: ld1 { v1.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #184 +; CHECK-NEXT: ld1 { v3.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #128 ; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #128 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: ldr b20, [sp, #872] -; CHECK-NEXT: ld1 { v2.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #192 -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #56 -; CHECK-NEXT: ld1 { v0.b }[6], [x10] -; CHECK-NEXT: add x10, sp, 
#64 -; CHECK-NEXT: mov v1.b[1], w1 -; CHECK-NEXT: ldr b21, [sp, #744] -; CHECK-NEXT: ld1 { v2.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #200 -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #136 -; CHECK-NEXT: mov v1.b[2], w2 -; CHECK-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-NEXT: add x8, sp, #72 -; CHECK-NEXT: ld1 { v3.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #288 -; CHECK-NEXT: ld1 { v2.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #224 -; CHECK-NEXT: mov v1.b[3], w3 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] -; CHECK-NEXT: add x10, sp, #240 -; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: mov v2.b[4], w4 +; CHECK-NEXT: ld1 { v1.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #192 +; CHECK-NEXT: ld1 { v3.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #360 +; CHECK-NEXT: ld1 { v0.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #56 +; CHECK-NEXT: mov v2.b[5], w5 +; CHECK-NEXT: ld1 { v1.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #200 +; CHECK-NEXT: ld1 { v3.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #64 +; CHECK-NEXT: ld1 { v0.b }[7], [x11] +; CHECK-NEXT: add x11, sp, #232 +; CHECK-NEXT: mov v2.b[6], w6 +; CHECK-NEXT: ld1 { v1.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #352 -; CHECK-NEXT: ld1 { v7.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #416 -; CHECK-NEXT: sshll v19.8h, v2.8b, #0 -; CHECK-NEXT: ldr b2, [sp, #472] +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: sshll v5.8h, v5.8b, #0 ; CHECK-NEXT: ld1 { v4.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #360 -; CHECK-NEXT: ld1 { v16.b }[1], [x9] +; CHECK-NEXT: add x8, sp, #224 +; CHECK-NEXT: mov v2.b[7], w7 +; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #416 +; CHECK-NEXT: ld1 { v6.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #288 +; CHECK-NEXT: ld1 { v4.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #368 +; CHECK-NEXT: ld1 { v7.b }[1], [x9] ; CHECK-NEXT: add x9, sp, #424 -; CHECK-NEXT: sshll v3.8h, v3.8b, #0 -; CHECK-NEXT: mov v1.b[4], w4 -; CHECK-NEXT: ld1 { v4.b }[2], [x8] -; 
CHECK-NEXT: add x8, sp, #232 -; CHECK-NEXT: ld1 { v16.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #432 -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: ld1 { v7.b }[2], [x8] +; CHECK-NEXT: ld1 { v17.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #296 -; CHECK-NEXT: ld1 { v4.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #376 -; CHECK-NEXT: ld1 { v16.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #440 +; CHECK-NEXT: ld1 { v6.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #240 +; CHECK-NEXT: ld1 { v4.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #376 +; CHECK-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #432 ; CHECK-NEXT: ld1 { v17.b }[2], [x8] ; CHECK-NEXT: add x8, sp, #304 -; CHECK-NEXT: ld1 { v7.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #248 -; CHECK-NEXT: ld1 { v4.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #384 -; CHECK-NEXT: ld1 { v16.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #448 +; CHECK-NEXT: ld1 { v6.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #248 +; CHECK-NEXT: ld1 { v4.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #384 +; CHECK-NEXT: ld1 { v7.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #440 ; CHECK-NEXT: ld1 { v17.b }[3], [x8] ; CHECK-NEXT: add x8, sp, #312 -; CHECK-NEXT: ld1 { v7.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #256 -; CHECK-NEXT: ld1 { v4.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #392 -; CHECK-NEXT: ld1 { v16.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #456 +; CHECK-NEXT: ld1 { v6.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #256 +; CHECK-NEXT: ld1 { v4.b }[5], [x10] +; CHECK-NEXT: add x10, sp, #392 +; CHECK-NEXT: ld1 { v7.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #448 ; CHECK-NEXT: ld1 { v17.b }[4], [x8] ; CHECK-NEXT: add x8, sp, #320 -; CHECK-NEXT: ld1 { v7.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #264 -; CHECK-NEXT: ld1 { v4.b }[6], [x11] -; CHECK-NEXT: add x11, sp, #400 -; CHECK-NEXT: ld1 { v16.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #464 +; CHECK-NEXT: ld1 { v6.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #264 +; CHECK-NEXT: sshll v19.8h, v2.8b, #0 +; CHECK-NEXT: ld1 { v4.b }[6], 
[x10] +; CHECK-NEXT: ld1 { v7.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #456 ; CHECK-NEXT: ld1 { v17.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #328 -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #272 -; CHECK-NEXT: ld1 { v4.b }[7], [x11] -; CHECK-NEXT: add x11, sp, #496 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #488 +; CHECK-NEXT: ld1 { v6.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #272 +; CHECK-NEXT: sshll v2.8h, v1.8b, #0 +; CHECK-NEXT: ldr b1, [sp, #608] +; CHECK-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #464 ; CHECK-NEXT: ld1 { v17.b }[6], [x8] ; CHECK-NEXT: add x8, sp, #336 -; CHECK-NEXT: ld1 { v7.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #640 -; CHECK-NEXT: sshll v6.8h, v4.8b, #0 -; CHECK-NEXT: sshll v16.8h, v16.8b, #0 +; CHECK-NEXT: ld1 { v6.b }[7], [x11] +; CHECK-NEXT: add x10, sp, #400 +; CHECK-NEXT: sshll v16.8h, v3.8b, #0 +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #624 ; CHECK-NEXT: ld1 { v17.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v18.8h, v7.8b, #0 -; CHECK-NEXT: ldr b7, [sp, #608] -; CHECK-NEXT: mov v1.b[5], w5 -; CHECK-NEXT: ld1 { v7.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #624 -; CHECK-NEXT: sshll v4.8h, v17.8b, #0 -; CHECK-NEXT: sshll v17.8h, v2.8b, #0 -; CHECK-NEXT: smull2 v2.4s, v3.8h, v4.8h -; CHECK-NEXT: smull v3.4s, v3.4h, v4.4h -; CHECK-NEXT: ld1 { v7.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #632 -; CHECK-NEXT: smull v4.4s, v5.4h, v17.4h -; CHECK-NEXT: ldr b17, [sp, #544] -; CHECK-NEXT: smlal2 v2.4s, v19.8h, v16.8h -; CHECK-NEXT: smlal v3.4s, v19.4h, v16.4h -; CHECK-NEXT: ldr b16, [sp, #480] -; CHECK-NEXT: ld1 { v7.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #552 -; CHECK-NEXT: ldr b19, [sp, #672] -; CHECK-NEXT: ld1 { v16.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #680 +; CHECK-NEXT: sshll v21.8h, v6.8b, #0 +; CHECK-NEXT: ldr b6, [sp, #472] +; CHECK-NEXT: ld1 { v4.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #552 +; CHECK-NEXT: 
ld1 { v1.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #488 +; CHECK-NEXT: sshll v18.8h, v17.8b, #0 +; CHECK-NEXT: ldr b17, [sp, #480] +; CHECK-NEXT: sshll v6.8h, v6.8b, #0 +; CHECK-NEXT: sshll v3.8h, v4.8b, #0 ; CHECK-NEXT: ld1 { v17.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #560 -; CHECK-NEXT: ld1 { v7.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #648 -; CHECK-NEXT: ld1 { v19.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #688 -; CHECK-NEXT: ld1 { v16.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #504 +; CHECK-NEXT: add x8, sp, #496 +; CHECK-NEXT: ld1 { v1.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #632 +; CHECK-NEXT: sshll v4.8h, v7.8b, #0 +; CHECK-NEXT: smull v20.4s, v5.4h, v6.4h +; CHECK-NEXT: movi v7.2d, #0000000000000000 ; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #568 -; CHECK-NEXT: ld1 { v7.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #656 -; CHECK-NEXT: ld1 { v19.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #696 -; CHECK-NEXT: ld1 { v16.b }[3], [x11] -; CHECK-NEXT: add x11, sp, #512 +; CHECK-NEXT: smull v5.4s, v16.4h, v18.4h +; CHECK-NEXT: ld1 { v1.b }[3], [x9] +; CHECK-NEXT: smull2 v16.4s, v16.8h, v18.8h +; CHECK-NEXT: ldr b18, [sp, #544] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: add x9, sp, #640 +; CHECK-NEXT: mov v7.s[0], v20.s[0] +; CHECK-NEXT: ldr b20, [sp, #672] +; CHECK-NEXT: ld1 { v18.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #680 ; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #576 -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: ld1 { v7.b }[6], [x10] -; CHECK-NEXT: ld1 { v19.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #704 -; CHECK-NEXT: ld1 { v16.b }[4], [x11] -; CHECK-NEXT: add x11, sp, #520 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #584 -; CHECK-NEXT: mov v1.b[6], w6 -; CHECK-NEXT: add x10, sp, #664 -; CHECK-NEXT: ld1 { v19.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #712 -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: ld1 { v16.b }[5], [x11] -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #880 -; 
CHECK-NEXT: mov v1.b[7], w7 -; CHECK-NEXT: ld1 { v7.b }[7], [x10] -; CHECK-NEXT: ld1 { v19.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #720 -; CHECK-NEXT: ld1 { v20.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #560 +; CHECK-NEXT: ld1 { v1.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #512 +; CHECK-NEXT: ld1 { v20.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #520 +; CHECK-NEXT: ld1 { v18.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #688 +; CHECK-NEXT: ld1 { v17.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #568 +; CHECK-NEXT: smull2 v6.4s, v19.8h, v21.8h +; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: ld1 { v20.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #696 +; CHECK-NEXT: ld1 { v18.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #576 +; CHECK-NEXT: ld1 { v17.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #528 -; CHECK-NEXT: add x8, sp, #888 -; CHECK-NEXT: add x11, sp, #592 -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: ld1 { v19.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #728 -; CHECK-NEXT: ld1 { v16.b }[6], [x10] +; CHECK-NEXT: smlal v7.4s, v19.4h, v21.4h +; CHECK-NEXT: ldr b19, [sp, #872] +; CHECK-NEXT: ld1 { v20.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #704 +; CHECK-NEXT: ld1 { v18.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #584 +; CHECK-NEXT: ld1 { v17.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #536 -; CHECK-NEXT: ld1 { v20.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #752 -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 -; CHECK-NEXT: ld1 { v17.b }[6], [x11] -; CHECK-NEXT: ld1 { v19.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #944 -; CHECK-NEXT: smull2 v5.4s, v1.8h, v18.8h -; CHECK-NEXT: ld1 { v21.b }[1], [x8] -; CHECK-NEXT: smlal v4.4s, v1.4h, v18.4h -; CHECK-NEXT: ldr b1, [sp, #936] -; CHECK-NEXT: add x11, sp, #600 -; CHECK-NEXT: ld1 { v16.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #896 -; CHECK-NEXT: add x8, sp, #760 -; CHECK-NEXT: ld1 { v1.b }[1], [x9] +; CHECK-NEXT: ldr b21, [sp, #936] +; CHECK-NEXT: add x11, sp, #656 +; CHECK-NEXT: ld1 { v20.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #712 +; CHECK-NEXT: ld1 { v18.b }[5], [x9] 
+; CHECK-NEXT: add x9, sp, #592 +; CHECK-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-NEXT: add x10, sp, #880 +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: ld1 { v20.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #600 +; CHECK-NEXT: ld1 { v18.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #720 +; CHECK-NEXT: ld1 { v19.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #944 +; CHECK-NEXT: smlal2 v6.4s, v0.8h, v3.8h +; CHECK-NEXT: add x11, sp, #664 +; CHECK-NEXT: ld1 { v20.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #888 +; CHECK-NEXT: ld1 { v18.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #728 +; CHECK-NEXT: ld1 { v21.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #752 +; CHECK-NEXT: ld1 { v19.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #952 -; CHECK-NEXT: ld1 { v17.b }[7], [x11] +; CHECK-NEXT: ld1 { v20.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #896 +; CHECK-NEXT: smlal v7.4s, v0.4h, v3.4h +; CHECK-NEXT: ldr b0, [sp, #744] +; CHECK-NEXT: ld1 { v21.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #904 +; CHECK-NEXT: ld1 { v19.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #960 +; CHECK-NEXT: ld1 { v0.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #760 +; CHECK-NEXT: ld1 { v1.b }[7], [x11] ; CHECK-NEXT: add x11, sp, #816 -; CHECK-NEXT: smlal2 v5.4s, v0.8h, v6.8h -; CHECK-NEXT: ld1 { v20.b }[3], [x10] -; CHECK-NEXT: smlal v4.4s, v0.4h, v6.4h -; CHECK-NEXT: ld1 { v21.b }[2], [x8] -; CHECK-NEXT: ldr b0, [sp, #808] -; CHECK-NEXT: add x10, sp, #904 -; CHECK-NEXT: add x8, sp, #768 -; CHECK-NEXT: ld1 { v1.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #960 -; CHECK-NEXT: ldr b18, [sp, #736] -; CHECK-NEXT: ld1 { v0.b }[1], [x11] -; CHECK-NEXT: add x11, sp, #824 -; CHECK-NEXT: ld1 { v20.b }[4], [x10] -; CHECK-NEXT: add x10, sp, #912 ; CHECK-NEXT: ld1 { v21.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #776 -; CHECK-NEXT: ld1 { v1.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #968 -; CHECK-NEXT: ld1 { v0.b }[2], [x11] -; CHECK-NEXT: add x11, sp, #832 -; CHECK-NEXT: ld1 { v20.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #920 
+; CHECK-NEXT: add x8, sp, #968 +; CHECK-NEXT: ldr b3, [sp, #808] +; CHECK-NEXT: ld1 { v19.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #912 +; CHECK-NEXT: ld1 { v0.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #768 +; CHECK-NEXT: ld1 { v3.b }[1], [x11] +; CHECK-NEXT: add x11, sp, #824 ; CHECK-NEXT: ld1 { v21.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #784 -; CHECK-NEXT: sshll v6.8h, v7.8b, #0 -; CHECK-NEXT: ld1 { v1.b }[4], [x9] -; CHECK-NEXT: sshll v7.8h, v16.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[3], [x11] -; CHECK-NEXT: sshll v16.8h, v18.8b, #0 -; CHECK-NEXT: ldr b18, [sp, #1000] -; CHECK-NEXT: ld1 { v20.b }[6], [x10] -; CHECK-NEXT: add x9, sp, #976 +; CHECK-NEXT: add x8, sp, #976 +; CHECK-NEXT: ld1 { v19.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #920 +; CHECK-NEXT: ld1 { v0.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #776 +; CHECK-NEXT: ld1 { v3.b }[2], [x11] +; CHECK-NEXT: add x11, sp, #832 ; CHECK-NEXT: ld1 { v21.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #840 -; CHECK-NEXT: sshll v18.8h, v18.8b, #0 -; CHECK-NEXT: add x10, sp, #928 -; CHECK-NEXT: ld1 { v1.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #792 -; CHECK-NEXT: smull v16.4s, v16.4h, v18.4h -; CHECK-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-NEXT: ld1 { v20.b }[7], [x10] -; CHECK-NEXT: add x10, sp, #984 -; CHECK-NEXT: add x8, sp, #848 -; CHECK-NEXT: ld1 { v21.b }[6], [x9] -; CHECK-NEXT: add x9, sp, #800 -; CHECK-NEXT: mov v16.s[1], wzr -; CHECK-NEXT: ld1 { v1.b }[6], [x10] -; CHECK-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-NEXT: add x10, sp, #992 -; CHECK-NEXT: add x8, sp, #856 -; CHECK-NEXT: ld1 { v21.b }[7], [x9] -; CHECK-NEXT: sshll v18.8h, v19.8b, #0 -; CHECK-NEXT: mov v16.s[2], wzr -; CHECK-NEXT: ld1 { v1.b }[7], [x10] +; CHECK-NEXT: add x8, sp, #984 +; CHECK-NEXT: ld1 { v19.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #928 +; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #848 +; CHECK-NEXT: ld1 { v3.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #840 +; CHECK-NEXT: ld1 { v21.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #992 +; 
CHECK-NEXT: ld1 { v19.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #784 +; CHECK-NEXT: smlal2 v16.4s, v2.8h, v4.8h +; CHECK-NEXT: ld1 { v3.b }[4], [x11] +; CHECK-NEXT: ld1 { v21.b }[7], [x8] +; CHECK-NEXT: add x8, sp, #792 +; CHECK-NEXT: ld1 { v0.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #856 +; CHECK-NEXT: smlal v5.4s, v2.4h, v4.4h +; CHECK-NEXT: ldr b2, [sp, #736] +; CHECK-NEXT: sshll v4.8h, v20.8b, #0 +; CHECK-NEXT: ldr b20, [sp, #1000] +; CHECK-NEXT: ld1 { v3.b }[5], [x10] +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #864 -; CHECK-NEXT: sshll v19.8h, v20.8b, #0 +; CHECK-NEXT: sshll v20.8h, v20.8b, #0 +; CHECK-NEXT: add x8, sp, #800 ; CHECK-NEXT: sshll v21.8h, v21.8b, #0 -; CHECK-NEXT: mov v16.s[3], wzr -; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: smull v2.4s, v2.4h, v20.4h +; CHECK-NEXT: ld1 { v3.b }[6], [x9] +; CHECK-NEXT: smull v20.4s, v4.4h, v21.4h ; CHECK-NEXT: ld1 { v0.b }[7], [x8] +; CHECK-NEXT: smull2 v4.4s, v4.8h, v21.8h +; CHECK-NEXT: add x9, sp, #864 +; CHECK-NEXT: movi v21.2d, #0000000000000000 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: sshll v17.8h, v17.8b, #0 -; CHECK-NEXT: smull v20.4s, v18.4h, v1.4h -; CHECK-NEXT: smull2 v1.4s, v18.8h, v1.8h -; CHECK-NEXT: smull2 v18.4s, v6.8h, v19.8h -; CHECK-NEXT: smlal v16.4s, v7.4h, v21.4h +; CHECK-NEXT: ld1 { v3.b }[7], [x9] +; CHECK-NEXT: sshll v19.8h, v19.8b, #0 +; CHECK-NEXT: mov v21.s[0], v2.s[0] ; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: smlal2 v18.4s, v7.8h, v21.8h -; CHECK-NEXT: smlal2 v1.4s, v17.8h, v0.8h -; CHECK-NEXT: smlal v20.4s, v17.4h, v0.4h -; CHECK-NEXT: smlal v16.4s, v6.4h, v19.4h -; CHECK-NEXT: add v0.4s, v5.4s, v2.4s -; CHECK-NEXT: add v2.4s, v4.4s, v3.4s -; CHECK-NEXT: add v1.4s, v18.4s, v1.4s -; CHECK-NEXT: add v3.4s, v16.4s, v20.4s -; CHECK-NEXT: add v0.4s, v2.4s, v0.4s -; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: smull2 v2.4s, v1.8h, v19.8h +; CHECK-NEXT: sshll v18.8h, v18.8b, #0 +; CHECK-NEXT: smlal 
v21.4s, v17.4h, v0.4h +; CHECK-NEXT: sshll v3.8h, v3.8b, #0 +; CHECK-NEXT: smlal2 v2.4s, v17.8h, v0.8h +; CHECK-NEXT: smlal2 v4.4s, v18.8h, v3.8h +; CHECK-NEXT: smlal v20.4s, v18.4h, v3.4h +; CHECK-NEXT: smlal v21.4s, v1.4h, v19.4h +; CHECK-NEXT: add v0.4s, v6.4s, v16.4s +; CHECK-NEXT: add v1.4s, v7.4s, v5.4s +; CHECK-NEXT: add v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v3.4s, v21.4s, v20.4s +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v1.4s, v3.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 @@ -2042,154 +2023,152 @@ define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> % ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: ldr b0, [sp, #80] ; CHECK-NEXT: add x8, sp, #88 -; CHECK-NEXT: ldr b3, [sp, #144] -; CHECK-NEXT: add x10, sp, #96 -; CHECK-NEXT: ldr b5, [sp, #16] -; CHECK-NEXT: add x9, sp, #24 +; CHECK-NEXT: ldr b2, [sp, #144] +; CHECK-NEXT: add x9, sp, #152 +; CHECK-NEXT: fmov s3, w0 +; CHECK-NEXT: ldr b4, [sp, #16] ; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #152 -; CHECK-NEXT: fmov s2, w0 -; CHECK-NEXT: ldr b1, [sp, #208] -; CHECK-NEXT: ld1 { v5.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #32 -; CHECK-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: ld1 { v0.b }[2], [x10] +; CHECK-NEXT: add x8, sp, #96 ; CHECK-NEXT: add x10, sp, #104 -; CHECK-NEXT: mov v2.b[1], w1 -; CHECK-NEXT: ldr b17, [sp, #672] -; CHECK-NEXT: ld1 { v5.b }[2], [x9] -; CHECK-NEXT: add x9, sp, #40 -; CHECK-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #168 +; CHECK-NEXT: ld1 { v2.b }[1], [x9] +; CHECK-NEXT: mov v3.b[1], w1 +; CHECK-NEXT: add x9, sp, #160 +; CHECK-NEXT: add x11, sp, #128 +; CHECK-NEXT: ldr b1, [sp, #208] +; CHECK-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #24 +; CHECK-NEXT: ld1 { v2.b }[2], [x9] +; CHECK-NEXT: add x9, sp, #168 +; CHECK-NEXT: mov v3.b[2], w2 +; CHECK-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #112 ; 
CHECK-NEXT: ld1 { v0.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #112 -; CHECK-NEXT: mov v2.b[2], w2 -; CHECK-NEXT: add x11, sp, #648 -; CHECK-NEXT: ld1 { v5.b }[3], [x9] -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #176 -; CHECK-NEXT: ld1 { v0.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #32 +; CHECK-NEXT: ld1 { v2.b }[3], [x9] +; CHECK-NEXT: add x9, sp, #176 +; CHECK-NEXT: mov v3.b[3], w3 +; CHECK-NEXT: ld1 { v4.b }[2], [x10] ; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: mov v2.b[3], w3 -; CHECK-NEXT: ld1 { v5.b }[4], [x9] -; CHECK-NEXT: add x9, sp, #56 -; CHECK-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #184 +; CHECK-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #40 +; CHECK-NEXT: ld1 { v2.b }[4], [x9] +; CHECK-NEXT: add x9, sp, #184 +; CHECK-NEXT: mov v3.b[4], w4 +; CHECK-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #48 ; CHECK-NEXT: ld1 { v0.b }[5], [x10] -; CHECK-NEXT: add x10, sp, #128 -; CHECK-NEXT: mov v2.b[4], w4 -; CHECK-NEXT: ld1 { v5.b }[5], [x9] -; CHECK-NEXT: add x9, sp, #64 -; CHECK-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-NEXT: add x8, sp, #192 -; CHECK-NEXT: ld1 { v0.b }[6], [x10] ; CHECK-NEXT: add x10, sp, #136 -; CHECK-NEXT: sshll v4.8h, v1.8b, #0 -; CHECK-NEXT: mov v2.b[5], w5 -; CHECK-NEXT: ld1 { v5.b }[6], [x9] -; CHECK-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-NEXT: add x8, sp, #200 +; CHECK-NEXT: ld1 { v2.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #192 +; CHECK-NEXT: mov v3.b[5], w5 +; CHECK-NEXT: ld1 { v4.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #56 +; CHECK-NEXT: ld1 { v0.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #632 +; CHECK-NEXT: ld1 { v2.b }[6], [x9] +; CHECK-NEXT: add x9, sp, #200 +; CHECK-NEXT: mov v3.b[6], w6 +; CHECK-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-NEXT: add x8, sp, #64 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 ; CHECK-NEXT: ld1 { v0.b }[7], [x10] -; CHECK-NEXT: add x9, sp, #72 +; CHECK-NEXT: ld1 { v2.b }[7], [x9] +; CHECK-NEXT: add x9, sp, #552 +; CHECK-NEXT: mov 
v3.b[7], w7 ; CHECK-NEXT: add x10, sp, #680 -; CHECK-NEXT: mov v2.b[6], w6 -; CHECK-NEXT: ld1 { v3.b }[7], [x8] +; CHECK-NEXT: ld1 { v4.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #72 +; CHECK-NEXT: movi v6.2d, #0000000000000000 +; CHECK-NEXT: sshll v5.4s, v1.4h, #0 +; CHECK-NEXT: ldr b1, [sp, #608] +; CHECK-NEXT: sshll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v4.b }[7], [x8] ; CHECK-NEXT: add x8, sp, #616 -; CHECK-NEXT: sshll v1.8h, v0.8b, #0 -; CHECK-NEXT: ldr b0, [sp, #608] -; CHECK-NEXT: ld1 { v5.b }[7], [x9] -; CHECK-NEXT: add x9, sp, #488 -; CHECK-NEXT: mov v2.b[7], w7 -; CHECK-NEXT: ld1 { v17.b }[1], [x10] -; CHECK-NEXT: ld1 { v0.b }[1], [x8] -; CHECK-NEXT: add x8, sp, #624 +; CHECK-NEXT: sshll v2.8h, v2.8b, #0 ; CHECK-NEXT: sshll v7.8h, v3.8b, #0 -; CHECK-NEXT: add x10, sp, #504 -; CHECK-NEXT: sshll v16.8h, v5.8b, #0 -; CHECK-NEXT: ldr b5, [sp, #480] -; CHECK-NEXT: sshll v6.8h, v2.8b, #0 -; CHECK-NEXT: ld1 { v0.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #632 -; CHECK-NEXT: saddl2 v2.4s, v16.8h, v7.8h -; CHECK-NEXT: ld1 { v5.b }[1], [x9] -; CHECK-NEXT: saddl v7.4s, v16.4h, v7.4h -; CHECK-NEXT: ldr b16, [sp, #544] -; CHECK-NEXT: add x9, sp, #552 -; CHECK-NEXT: ld1 { v0.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #640 -; CHECK-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-NEXT: ld1 { v16.b }[1], [x9] -; CHECK-NEXT: add x9, sp, #560 -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: ld1 { v0.b }[4], [x8] +; CHECK-NEXT: ld1 { v1.b }[1], [x8] +; CHECK-NEXT: add x8, sp, #624 +; CHECK-NEXT: sshll v3.8h, v4.8b, #0 +; CHECK-NEXT: mov v6.s[0], v5.s[0] +; CHECK-NEXT: saddl2 v5.4s, v3.8h, v2.8h +; CHECK-NEXT: saddl2 v16.4s, v7.8h, v0.8h +; CHECK-NEXT: ld1 { v1.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #488 +; CHECK-NEXT: saddw v4.4s, v6.4s, v7.4h +; CHECK-NEXT: ldr b6, [sp, #480] +; CHECK-NEXT: add v5.4s, v16.4s, v5.4s +; CHECK-NEXT: ldr b7, [sp, #544] +; CHECK-NEXT: ldr b16, [sp, #672] +; CHECK-NEXT: ld1 { v6.b }[1], [x8] ; CHECK-NEXT: add x8, sp, #496 -; CHECK-NEXT: saddl2 v3.4s, v6.8h, v1.8h -; 
CHECK-NEXT: ld1 { v16.b }[2], [x9] +; CHECK-NEXT: ld1 { v7.b }[1], [x9] +; CHECK-NEXT: add x9, sp, #560 +; CHECK-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-NEXT: add x10, sp, #688 +; CHECK-NEXT: ld1 { v1.b }[3], [x11] +; CHECK-NEXT: add x11, sp, #640 +; CHECK-NEXT: ld1 { v6.b }[2], [x8] +; CHECK-NEXT: add x8, sp, #504 +; CHECK-NEXT: ld1 { v7.b }[2], [x9] ; CHECK-NEXT: add x9, sp, #568 -; CHECK-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #688 -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: ld1 { v0.b }[5], [x11] -; CHECK-NEXT: add x11, sp, #656 -; CHECK-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-NEXT: add x8, sp, #696 -; CHECK-NEXT: ld1 { v5.b }[3], [x10] -; CHECK-NEXT: add x10, sp, #512 -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: ld1 { v16.b }[3], [x9] +; CHECK-NEXT: ld1 { v16.b }[2], [x10] +; CHECK-NEXT: add x10, sp, #696 +; CHECK-NEXT: ld1 { v1.b }[4], [x11] +; CHECK-NEXT: add x11, sp, #648 +; CHECK-NEXT: ld1 { v6.b }[3], [x8] +; CHECK-NEXT: add x8, sp, #512 +; CHECK-NEXT: ld1 { v7.b }[3], [x9] ; CHECK-NEXT: add x9, sp, #576 -; CHECK-NEXT: ld1 { v0.b }[6], [x11] -; CHECK-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-NEXT: add x8, sp, #704 -; CHECK-NEXT: ld1 { v5.b }[4], [x10] -; CHECK-NEXT: add x11, sp, #520 -; CHECK-NEXT: saddw v4.4s, v4.4s, v6.4h -; CHECK-NEXT: ldr b6, [sp, #736] -; CHECK-NEXT: ld1 { v16.b }[4], [x9] +; CHECK-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-NEXT: add x10, sp, #704 +; CHECK-NEXT: ld1 { v1.b }[5], [x11] +; CHECK-NEXT: add x11, sp, #656 +; CHECK-NEXT: ld1 { v6.b }[4], [x8] +; CHECK-NEXT: add x8, sp, #520 +; CHECK-NEXT: ld1 { v7.b }[4], [x9] ; CHECK-NEXT: add x9, sp, #584 -; CHECK-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-NEXT: add x8, sp, #712 -; CHECK-NEXT: add x10, sp, #664 -; CHECK-NEXT: ld1 { v5.b }[5], [x11] -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h -; CHECK-NEXT: sshll v4.8h, v6.8b, #0 -; CHECK-NEXT: ld1 { v16.b }[5], [x9] -; CHECK-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-NEXT: add x9, sp, #592 -; CHECK-NEXT: ld1 { v0.b }[7], [x10] +; CHECK-NEXT: 
ld1 { v16.b }[4], [x10] +; CHECK-NEXT: add x10, sp, #712 +; CHECK-NEXT: ld1 { v1.b }[6], [x11] +; CHECK-NEXT: add x11, sp, #664 +; CHECK-NEXT: ld1 { v6.b }[5], [x8] ; CHECK-NEXT: add x8, sp, #528 +; CHECK-NEXT: ld1 { v7.b }[5], [x9] +; CHECK-NEXT: add x9, sp, #592 +; CHECK-NEXT: ld1 { v16.b }[5], [x10] ; CHECK-NEXT: add x10, sp, #720 -; CHECK-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-NEXT: ld1 { v16.b }[6], [x9] -; CHECK-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-NEXT: add x9, sp, #600 -; CHECK-NEXT: mov v4.s[1], wzr -; CHECK-NEXT: ld1 { v17.b }[6], [x10] -; CHECK-NEXT: add x10, sp, #728 -; CHECK-NEXT: add x8, sp, #536 -; CHECK-NEXT: ld1 { v16.b }[7], [x9] -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v4.s[2], wzr -; CHECK-NEXT: ld1 { v17.b }[7], [x10] -; CHECK-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-NEXT: sshll v3.8h, v16.8b, #0 -; CHECK-NEXT: mov v4.s[3], wzr -; CHECK-NEXT: sshll v6.8h, v17.8b, #0 -; CHECK-NEXT: sshll v5.8h, v5.8b, #0 -; CHECK-NEXT: add v1.4s, v1.4s, v7.4s -; CHECK-NEXT: saddl2 v7.4s, v6.8h, v3.8h -; CHECK-NEXT: saddl2 v16.4s, v0.8h, v5.8h -; CHECK-NEXT: saddw v4.4s, v4.4s, v5.4h -; CHECK-NEXT: saddl v3.4s, v6.4h, v3.4h -; CHECK-NEXT: add v5.4s, v16.4s, v7.4s +; CHECK-NEXT: saddl v2.4s, v3.4h, v2.4h +; CHECK-NEXT: ldr b3, [sp, #736] +; CHECK-NEXT: ld1 { v6.b }[6], [x8] +; CHECK-NEXT: add x8, sp, #600 ; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h -; CHECK-NEXT: add v3.4s, v3.4s, v5.4s +; CHECK-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-NEXT: ld1 { v16.b }[6], [x10] +; CHECK-NEXT: add x9, sp, #728 +; CHECK-NEXT: add x10, sp, #536 +; CHECK-NEXT: ld1 { v1.b }[7], [x11] +; CHECK-NEXT: movi v4.2d, #0000000000000000 +; CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-NEXT: sshll v2.8h, v3.8b, #0 +; CHECK-NEXT: ld1 { v16.b }[7], [x9] +; CHECK-NEXT: ld1 { v6.b }[7], [x10] +; CHECK-NEXT: sshll v2.4s, v2.4h, #0 +; CHECK-NEXT: sshll v1.8h, v1.8b, #0 +; CHECK-NEXT: mov v4.s[0], v2.s[0] +; CHECK-NEXT: 
sshll v3.8h, v7.8b, #0 +; CHECK-NEXT: sshll v7.8h, v16.8b, #0 +; CHECK-NEXT: sshll v2.8h, v6.8b, #0 +; CHECK-NEXT: saddl2 v6.4s, v7.8h, v3.8h +; CHECK-NEXT: saddl2 v16.4s, v1.8h, v2.8h +; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h +; CHECK-NEXT: saddl v3.4s, v7.4h, v3.4h +; CHECK-NEXT: add v4.4s, v16.4s, v6.4s +; CHECK-NEXT: saddw v1.4s, v2.4s, v1.4h +; CHECK-NEXT: add v2.4s, v3.4s, v4.4s +; CHECK-NEXT: add v0.4s, v0.4s, v5.4s ; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: add v0.4s, v0.4s, v3.4s -; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-NEXT: addv s0, v0.4s ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll index 94aea27..4d847fb 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add-legalization.ll @@ -99,13 +99,9 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.b[9], wzr -; CHECK-NEXT: mov v0.b[10], wzr -; CHECK-NEXT: mov v0.b[11], wzr -; CHECK-NEXT: mov v0.b[12], wzr -; CHECK-NEXT: mov v0.b[13], wzr -; CHECK-NEXT: mov v0.b[14], wzr -; CHECK-NEXT: mov v0.b[15], wzr +; CHECK-NEXT: adrp x8, .LCPI8_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: addv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index b80d6f9..a14f12a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -114,13 +114,9 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; 
CHECK-NEXT: mov v0.b[9], wzr -; CHECK-NEXT: mov v0.b[10], wzr -; CHECK-NEXT: mov v0.b[11], wzr -; CHECK-NEXT: mov v0.b[12], wzr -; CHECK-NEXT: mov v0.b[13], wzr -; CHECK-NEXT: mov v0.b[14], wzr -; CHECK-NEXT: mov v0.b[15], wzr +; CHECK-NEXT: adrp x8, .LCPI9_0 +; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI9_0] +; CHECK-NEXT: and v0.16b, v0.16b, v1.16b ; CHECK-NEXT: umaxv b0, v0.16b ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll index 45d74ea..ef21006 100644 --- a/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll +++ b/llvm/test/CodeGen/X86/clear_upper_vector_element_bits.ll @@ -542,7 +542,7 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind { define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind { ; SSE2-LABEL: _clearupper4xi64b: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [NaN,0.0E+0,NaN,0.0E+0] +; SSE2-NEXT: movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0] ; SSE2-NEXT: andps %xmm2, %xmm0 ; SSE2-NEXT: andps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -689,165 +689,14 @@ define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind { } define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { -; SSE2-LABEL: _clearupper16xi8b: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm1, %rdx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $56, %rax -; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: shrq $48, %rcx -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: shrq $32, %r8 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %r10, %rdi -; SSE2-NEXT: shrq $56, %rdi -; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %r10, %r9 -; SSE2-NEXT: shrq $48, %r9 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %r11 -; 
SSE2-NEXT: shrq $40, %r11 -; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %r10, %rbx -; SSE2-NEXT: shrq $32, %rbx -; SSE2-NEXT: andl $15, %ebx -; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %r10 -; SSE2-NEXT: shlq $40, %r11 -; SSE2-NEXT: orq %r10, %r11 -; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %r11, %r9 -; SSE2-NEXT: shlq $56, %rdi -; SSE2-NEXT: orq %r9, %rdi -; SSE2-NEXT: shlq $32, %r8 -; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r8, %rdx -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: shlq $56, %rax -; SSE2-NEXT: orq %rcx, %rax -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: movq %rax, %xmm1 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: retq -; -; SSE42-LABEL: _clearupper16xi8b: -; SSE42: # %bb.0: -; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %rdx -; SSE42-NEXT: movq %rdx, %rax -; SSE42-NEXT: shrq $56, %rax -; SSE42-NEXT: andl $15, %eax -; SSE42-NEXT: movq %rdx, %rcx -; SSE42-NEXT: shrq $48, %rcx -; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rdx, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi -; SSE42-NEXT: movq %rdx, %r8 -; SSE42-NEXT: shrq $32, %r8 -; SSE42-NEXT: andl $15, %r8d -; SSE42-NEXT: movq %xmm0, %r10 -; SSE42-NEXT: movq %r10, %rdi -; SSE42-NEXT: shrq $56, %rdi -; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %r10, %r9 -; SSE42-NEXT: shrq $48, %r9 -; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $40, %r11 -; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %r10, %rbx -; SSE42-NEXT: shrq $32, %rbx -; SSE42-NEXT: andl $15, %ebx -; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %r10 -; SSE42-NEXT: shlq $40, %r11 -; SSE42-NEXT: orq %r10, %r11 -; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %r11, %r9 -; 
SSE42-NEXT: shlq $56, %rdi -; SSE42-NEXT: orq %r9, %rdi -; SSE42-NEXT: shlq $32, %r8 -; SSE42-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r8, %rdx -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %rdx, %rsi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rsi, %rcx -; SSE42-NEXT: shlq $56, %rax -; SSE42-NEXT: orq %rcx, %rax -; SSE42-NEXT: movq %rax, %xmm1 -; SSE42-NEXT: movq %rdi, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: popq %rbx -; SSE42-NEXT: retq +; SSE-LABEL: _clearupper16xi8b: +; SSE: # %bb.0: +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _clearupper16xi8b: ; AVX: # %bb.0: -; AVX-NEXT: pushq %rbx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shrq $40, %rdx -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: shrq $32, %rsi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movq %rdi, %r9 -; AVX-NEXT: shrq $48, %r9 -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: movq %rdi, %r10 -; AVX-NEXT: shrq $40, %r10 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movq %rdi, %r11 -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: shrq $56, %r8 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: shrq $56, %rbx -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: shlq $32, %r11 -; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; AVX-NEXT: orq %r11, %rdi -; AVX-NEXT: shlq $40, %r10 -; AVX-NEXT: orq %rdi, %r10 -; AVX-NEXT: shlq $48, %r9 -; AVX-NEXT: orq %r10, %r9 -; AVX-NEXT: shlq $56, %rbx -; AVX-NEXT: orq %r9, %rbx -; AVX-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shlq $32, %rsi -; AVX-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NEXT: orq %rsi, %rcx -; AVX-NEXT: shlq $40, %rdx -; AVX-NEXT: orq %rcx, 
%rdx -; AVX-NEXT: shlq $48, %rax -; AVX-NEXT: orq %rdx, %rax -; AVX-NEXT: shlq $56, %r8 -; AVX-NEXT: orq %rax, %r8 -; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0 -; AVX-NEXT: popq %rbx +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %x4 = bitcast <16 x i8> %0 to <32 x i4> %r0 = insertelement <32 x i4> %x4, i4 zeroinitializer, i32 1 @@ -871,165 +720,14 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind { } define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind { -; SSE2-LABEL: _clearupper32xi8b: -; SSE2: # %bb.0: -; SSE2-NEXT: pushq %rbx -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; SSE2-NEXT: movq %xmm2, %rdx -; SSE2-NEXT: movq %rdx, %rax -; SSE2-NEXT: shrq $56, %rax -; SSE2-NEXT: andl $15, %eax -; SSE2-NEXT: movq %rdx, %rcx -; SSE2-NEXT: shrq $48, %rcx -; SSE2-NEXT: andl $15, %ecx -; SSE2-NEXT: movq %rdx, %rsi -; SSE2-NEXT: shrq $40, %rsi -; SSE2-NEXT: andl $15, %esi -; SSE2-NEXT: movq %rdx, %r8 -; SSE2-NEXT: shrq $32, %r8 -; SSE2-NEXT: andl $15, %r8d -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %r10, %rdi -; SSE2-NEXT: shrq $56, %rdi -; SSE2-NEXT: andl $15, %edi -; SSE2-NEXT: movq %r10, %r9 -; SSE2-NEXT: shrq $48, %r9 -; SSE2-NEXT: andl $15, %r9d -; SSE2-NEXT: movq %r10, %r11 -; SSE2-NEXT: shrq $40, %r11 -; SSE2-NEXT: andl $15, %r11d -; SSE2-NEXT: movq %r10, %rbx -; SSE2-NEXT: shrq $32, %rbx -; SSE2-NEXT: andl $15, %ebx -; SSE2-NEXT: shlq $32, %rbx -; SSE2-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE2-NEXT: orq %rbx, %r10 -; SSE2-NEXT: shlq $40, %r11 -; SSE2-NEXT: orq %r10, %r11 -; SSE2-NEXT: shlq $48, %r9 -; SSE2-NEXT: orq %r11, %r9 -; SSE2-NEXT: shlq $56, %rdi -; SSE2-NEXT: orq %r9, %rdi -; SSE2-NEXT: shlq $32, %r8 -; SSE2-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; SSE2-NEXT: orq %r8, %rdx -; SSE2-NEXT: shlq $40, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: shlq $48, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: shlq $56, %rax -; SSE2-NEXT: orq 
%rcx, %rax -; SSE2-NEXT: movq %rdi, %xmm0 -; SSE2-NEXT: movq %rax, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: popq %rbx -; SSE2-NEXT: retq -; -; SSE42-LABEL: _clearupper32xi8b: -; SSE42: # %bb.0: -; SSE42-NEXT: pushq %rbx -; SSE42-NEXT: pextrq $1, %xmm0, %rdx -; SSE42-NEXT: movq %rdx, %rax -; SSE42-NEXT: shrq $56, %rax -; SSE42-NEXT: andl $15, %eax -; SSE42-NEXT: movq %rdx, %rcx -; SSE42-NEXT: shrq $48, %rcx -; SSE42-NEXT: andl $15, %ecx -; SSE42-NEXT: movq %rdx, %rsi -; SSE42-NEXT: shrq $40, %rsi -; SSE42-NEXT: andl $15, %esi -; SSE42-NEXT: movq %rdx, %r8 -; SSE42-NEXT: shrq $32, %r8 -; SSE42-NEXT: andl $15, %r8d -; SSE42-NEXT: movq %xmm0, %r10 -; SSE42-NEXT: movq %r10, %rdi -; SSE42-NEXT: shrq $56, %rdi -; SSE42-NEXT: andl $15, %edi -; SSE42-NEXT: movq %r10, %r9 -; SSE42-NEXT: shrq $48, %r9 -; SSE42-NEXT: andl $15, %r9d -; SSE42-NEXT: movq %r10, %r11 -; SSE42-NEXT: shrq $40, %r11 -; SSE42-NEXT: andl $15, %r11d -; SSE42-NEXT: movq %r10, %rbx -; SSE42-NEXT: shrq $32, %rbx -; SSE42-NEXT: andl $15, %ebx -; SSE42-NEXT: shlq $32, %rbx -; SSE42-NEXT: andl $252645135, %r10d # imm = 0xF0F0F0F -; SSE42-NEXT: orq %rbx, %r10 -; SSE42-NEXT: shlq $40, %r11 -; SSE42-NEXT: orq %r10, %r11 -; SSE42-NEXT: shlq $48, %r9 -; SSE42-NEXT: orq %r11, %r9 -; SSE42-NEXT: shlq $56, %rdi -; SSE42-NEXT: orq %r9, %rdi -; SSE42-NEXT: shlq $32, %r8 -; SSE42-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; SSE42-NEXT: orq %r8, %rdx -; SSE42-NEXT: shlq $40, %rsi -; SSE42-NEXT: orq %rdx, %rsi -; SSE42-NEXT: shlq $48, %rcx -; SSE42-NEXT: orq %rsi, %rcx -; SSE42-NEXT: shlq $56, %rax -; SSE42-NEXT: orq %rcx, %rax -; SSE42-NEXT: movq %rax, %xmm2 -; SSE42-NEXT: movq %rdi, %xmm0 -; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE42-NEXT: popq %rbx -; SSE42-NEXT: retq +; SSE-LABEL: _clearupper32xi8b: +; SSE: # %bb.0: +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: _clearupper32xi8b: ; AVX: # %bb.0: -; AVX-NEXT: pushq 
%rbx -; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx -; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdi -; AVX-NEXT: movq %rcx, %rax -; AVX-NEXT: shrq $48, %rax -; AVX-NEXT: andl $15, %eax -; AVX-NEXT: movq %rcx, %rdx -; AVX-NEXT: shrq $40, %rdx -; AVX-NEXT: andl $15, %edx -; AVX-NEXT: movq %rcx, %rsi -; AVX-NEXT: shrq $32, %rsi -; AVX-NEXT: andl $15, %esi -; AVX-NEXT: movq %rdi, %r9 -; AVX-NEXT: shrq $48, %r9 -; AVX-NEXT: andl $15, %r9d -; AVX-NEXT: movq %rdi, %r10 -; AVX-NEXT: shrq $40, %r10 -; AVX-NEXT: andl $15, %r10d -; AVX-NEXT: movq %rdi, %r11 -; AVX-NEXT: shrq $32, %r11 -; AVX-NEXT: andl $15, %r11d -; AVX-NEXT: movq %rcx, %r8 -; AVX-NEXT: shrq $56, %r8 -; AVX-NEXT: andl $15, %r8d -; AVX-NEXT: movq %rdi, %rbx -; AVX-NEXT: shrq $56, %rbx -; AVX-NEXT: andl $15, %ebx -; AVX-NEXT: shlq $32, %r11 -; AVX-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F -; AVX-NEXT: orq %r11, %rdi -; AVX-NEXT: shlq $40, %r10 -; AVX-NEXT: orq %rdi, %r10 -; AVX-NEXT: shlq $48, %r9 -; AVX-NEXT: orq %r10, %r9 -; AVX-NEXT: shlq $56, %rbx -; AVX-NEXT: orq %r9, %rbx -; AVX-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) -; AVX-NEXT: shlq $32, %rsi -; AVX-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; AVX-NEXT: orq %rsi, %rcx -; AVX-NEXT: shlq $40, %rdx -; AVX-NEXT: orq %rcx, %rdx -; AVX-NEXT: shlq $48, %rax -; AVX-NEXT: orq %rdx, %rax -; AVX-NEXT: shlq $56, %r8 -; AVX-NEXT: orq %rax, %r8 -; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; AVX-NEXT: vinsertf128 $0, -{{[0-9]+}}(%rsp), %ymm0, %ymm0 -; AVX-NEXT: popq %rbx +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %x4 = bitcast <32 x i8> %0 to <64 x i4> %r0 = insertelement <64 x i4> %x4, i4 zeroinitializer, i32 1 diff --git a/llvm/test/CodeGen/X86/insertelement-zero.ll b/llvm/test/CodeGen/X86/insertelement-zero.ll index 6a21746..952940d 100644 --- a/llvm/test/CodeGen/X86/insertelement-zero.ll +++ b/llvm/test/CodeGen/X86/insertelement-zero.ll @@ -4,9 +4,9 @@ ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2 define <2 x double> @insert_v2f64_z1(<2 x double> %a) { ; SSE2-LABEL: insert_v2f64_z1: @@ -268,29 +268,20 @@ define <4 x i32> @insert_v4i32_01z3(<4 x i32> %a) { define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) { ; SSE2-LABEL: insert_v8i32_z12345z7: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: xorps %xmm2, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i32_z12345z7: ; SSE3: # %bb.0: -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE3-NEXT: xorps %xmm2, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm2 = 
xmm2[0,0],xmm1[3,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i32_z12345z7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSSE3-NEXT: xorps %xmm2, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i32_z12345z7: @@ -313,23 +304,17 @@ define <8 x i32> @insert_v8i32_z12345z7(<8 x i32> %a) { define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) { ; SSE2-LABEL: insert_v8i16_z12345z7: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v8i16_z12345z7: ; SSE3: # %bb.0: -; SSE3-NEXT: xorl %eax, %eax -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v8i16_z12345z7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v8i16_z12345z7: @@ -351,26 +336,20 @@ define <8 x i16> @insert_v8i16_z12345z7(<8 x i16> %a) { define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) { ; SSE2-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE2: # %bb.0: -; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %eax, %xmm1 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: 
andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE2-NEXT: retq ; ; SSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSE3: # %bb.0: -; SSE3-NEXT: xorl %eax, %eax -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: insert_v16i16_z12345z789ABCDEz: ; SSSE3: # %bb.0: -; SSSE3-NEXT: xorl %eax, %eax -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: insert_v16i16_z12345z789ABCDEz: @@ -391,46 +370,15 @@ define <16 x i16> @insert_v16i16_z12345z789ABCDEz(<16 x i16> %a) { } define <16 x i8> @insert_v16i8_z123456789ABCDEz(<16 x i8> %a) { -; SSE2-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE2: # %bb.0: -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE3: # %bb.0: -; SSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v16i8_z123456789ABCDEz: -; SSSE3: # %bb.0: -; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v16i8_z123456789ABCDEz: -; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: retq -; -; AVX1-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX1: # %bb.0: -; AVX1-NEXT: xorl %eax, %eax -; AVX1-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX1-NEXT: retq +; SSE-LABEL: insert_v16i8_z123456789ABCDEz: +; SSE: # %bb.0: +; SSE-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; -; AVX2-SLOW-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX2-SLOW: # 
%bb.0: -; AVX2-SLOW-NEXT: xorl %eax, %eax -; AVX2-SLOW-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-LABEL: insert_v16i8_z123456789ABCDEz: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX2-FAST-NEXT: retq +; AVX-LABEL: insert_v16i8_z123456789ABCDEz: +; AVX: # %bb.0: +; AVX-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX-NEXT: retq %1 = insertelement <16 x i8> %a, i8 0, i32 0 %2 = insertelement <16 x i8> %1, i8 0, i32 15 ret <16 x i8> %2 @@ -457,11 +405,9 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) { ; ; SSE41-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: ; SSE41: # %bb.0: -; SSE41-NEXT: xorl %eax, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 ; SSE41-NEXT: pxor %xmm2, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] +; SSE41-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz: -- 2.7.4