From 09f4cedd6189a2ab9464b777ecc8e10610a7ff2c Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 30 Jun 2023 11:21:26 +0100 Subject: [PATCH] [AArch64] Codegen tests for fold from D153972. NFC --- llvm/test/CodeGen/AArch64/extbinopload.ll | 1355 +++++++++++++++++++++++++++++ 1 file changed, 1355 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/extbinopload.ll diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll new file mode 100644 index 0000000..3465cb9 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/extbinopload.ll @@ -0,0 +1,1355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s + +define <4 x i16> @normal_load_v4i8(ptr %p) { +; CHECK-LABEL: normal_load_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: add v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %l1 = load <4 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 4 + %l2 = load <4 x i8>, ptr %q + %e1 = zext <4 x i8> %l1 to <4 x i16> + %e2 = zext <4 x i8> %l2 to <4 x i16> + %a = add <4 x i16> %e1, %e2 + ret <4 x i16> %a +} + +define <4 x i32> @normal_load_v4i16_v4i32(ptr %p) { +; CHECK-LABEL: normal_load_v4i16_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ret + %l1 = load <4 x i16>, ptr %p + %q = getelementptr i8, ptr %p, i32 8 + %l2 = load <4 x i16>, ptr %q + %e1 = zext <4 x i16> %l1 to <4 x i32> + %e2 = zext <4 x i16> %l2 to <4 x i32> + %a = add <4 x i32> %e1, %e2 + ret <4 x i32> %a +} + +define <4 x i16> @load_v4i8(ptr %p) { +; CHECK-LABEL: load_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s1, s0, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: shl v0.4h, v0.4h, #3 +; CHECK-NEXT: add v0.4h, v1.4h, v0.4h +; 
CHECK-NEXT: ret + %l1 = load <4 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 4 + %l2 = load <4 x i8>, ptr %q + %e1 = zext <4 x i8> %l1 to <4 x i16> + %e2 = zext <4 x i8> %l2 to <4 x i16> + %e3 = shl <4 x i16> %e2, <i16 3, i16 3, i16 3, i16 3> + %a = add <4 x i16> %e1, %e3 + ret <4 x i16> %a +} + +define <4 x i32> @load_v4i16_v4i32(ptr %p) { +; CHECK-LABEL: load_v4i16_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d1, d0, [x0] +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret + %l1 = load <4 x i16>, ptr %p + %q = getelementptr i8, ptr %p, i32 8 + %l2 = load <4 x i16>, ptr %q + %e1 = zext <4 x i16> %l1 to <4 x i32> + %e2 = zext <4 x i16> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} + +define <4 x i64> @load_v4i32_v4i64(ptr %p) { +; CHECK-LABEL: load_v4i32_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ushll2 v1.2d, v0.4s, #3 +; CHECK-NEXT: ushll v0.2d, v0.2s, #3 +; CHECK-NEXT: uaddw2 v1.2d, v1.2d, v2.4s +; CHECK-NEXT: uaddw v0.2d, v0.2d, v2.2s +; CHECK-NEXT: ret + %l1 = load <4 x i32>, ptr %p + %q = getelementptr i8, ptr %p, i32 16 + %l2 = load <4 x i32>, ptr %q + %e1 = zext <4 x i32> %l1 to <4 x i64> + %e2 = zext <4 x i32> %l2 to <4 x i64> + %e3 = shl <4 x i64> %e2, <i64 3, i64 3, i64 3, i64 3> + %a = add <4 x i64> %e1, %e3 + ret <4 x i64> %a +} + +define <4 x i32> @load_v4i8_v4i32(ptr %p) { +; CHECK-LABEL: load_v4i8_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s1, s0, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret + %l1 = load <4 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 4 + %l2 = load <4 x i8>, ptr %q + %e1 = zext <4 x i8> %l1 to <4 x i32> + %e2 = zext <4 x i8> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} + +define <4 x i32> @load_v4i12_v4i32(ptr %p) { +; CHECK-LABEL: load_v4i12_v4i32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: ldur w8, [x0, #6] +; CHECK-NEXT: ldr w9, [x0] +; CHECK-NEXT: ldrh w12, [x0, #10] +; CHECK-NEXT: and w10, w8, #0xfff +; CHECK-NEXT: ldrh w13, [x0, #4] +; CHECK-NEXT: and w11, w9, #0xfff +; CHECK-NEXT: fmov s0, w10 +; CHECK-NEXT: ubfx w10, w8, #12, #12 +; CHECK-NEXT: fmov s1, w11 +; CHECK-NEXT: ubfx w11, w9, #12, #12 +; CHECK-NEXT: orr x8, x8, x12, lsl #32 +; CHECK-NEXT: orr x9, x9, x13, lsl #32 +; CHECK-NEXT: mov v0.s[1], w10 +; CHECK-NEXT: ubfx x8, x8, #24, #12 +; CHECK-NEXT: mov v1.s[1], w11 +; CHECK-NEXT: ubfx x9, x9, #24, #12 +; CHECK-NEXT: mov v0.s[2], w8 +; CHECK-NEXT: ubfx w8, w12, #4, #12 +; CHECK-NEXT: mov v1.s[2], w9 +; CHECK-NEXT: ubfx w9, w13, #4, #12 +; CHECK-NEXT: mov v0.s[3], w8 +; CHECK-NEXT: mov v1.s[3], w9 +; CHECK-NEXT: shl v0.4s, v0.4s, #3 +; CHECK-NEXT: add v0.4s, v1.4s, v0.4s +; CHECK-NEXT: ret + %l1 = load <4 x i12>, ptr %p + %q = getelementptr i8, ptr %p, i32 6 + %l2 = load <4 x i12>, ptr %q + %e1 = zext <4 x i12> %l1 to <4 x i32> + %e2 = zext <4 x i12> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} + +define <8 x i16> @load_v8i8(ptr %p) { +; CHECK-LABEL: load_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d1, d0, [x0] +; CHECK-NEXT: ushll v0.8h, v0.8b, #3 +; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ret + %l1 = load <8 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 8 + %l2 = load <8 x i8>, ptr %q + %e1 = zext <8 x i8> %l1 to <8 x i16> + %e2 = zext <8 x i8> %l2 to <8 x i16> + %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %a = add <8 x i16> %e1, %e3 + ret <8 x i16> %a +} + +define <8 x i16> @loadadd_v8i8(ptr %p1, ptr %p2) { +; CHECK-LABEL: loadadd_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldp d3, d2, [x1] +; CHECK-NEXT: add v0.8b, v0.8b, v3.8b +; CHECK-NEXT: add v1.8b, v1.8b, v2.8b +; CHECK-NEXT: ushll v1.8h, v1.8b, #3 +; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b +; CHECK-NEXT: ret + %l11 = load <8 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 8 
+ %l12 = load <8 x i8>, ptr %q1 + %l21 = load <8 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 8 + %l22 = load <8 x i8>, ptr %q2 + %l1 = add <8 x i8> %l11, %l21 + %l2 = add <8 x i8> %l12, %l22 + %e1 = zext <8 x i8> %l1 to <8 x i16> + %e2 = zext <8 x i8> %l2 to <8 x i16> + %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %a = add <8 x i16> %e1, %e3 + ret <8 x i16> %a +} + +define <8 x i32> @loadaddext_v8i8(ptr %p1, ptr %p2) { +; CHECK-LABEL: loadaddext_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d2, d0, [x0] +; CHECK-NEXT: ldp d3, d1, [x1] +; CHECK-NEXT: uaddl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v2.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: ret + %l11 = load <8 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 8 + %l12 = load <8 x i8>, ptr %q1 + %l21 = load <8 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 8 + %l22 = load <8 x i8>, ptr %q2 + %le11 = zext <8 x i8> %l11 to <8 x i16> + %le12 = zext <8 x i8> %l12 to <8 x i16> + %le21 = zext <8 x i8> %l21 to <8 x i16> + %le22 = zext <8 x i8> %l22 to <8 x i16> + %l1 = add <8 x i16> %le11, %le21 + %l2 = add <8 x i16> %le12, %le22 + %e1 = zext <8 x i16> %l1 to <8 x i32> + %e2 = zext <8 x i16> %l2 to <8 x i32> + %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %a = add <8 x i32> %e1, %e3 + ret <8 x i32> %a +} + +define <4 x i32> @loadaddext_v4i8(ptr %p1, ptr %p2) { +; CHECK-LABEL: loadaddext_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s2, s3, [x1] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: add v1.4h, v1.4h, v3.4h +; CHECK-NEXT: add v0.4h, v0.4h, v2.4h +; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: ret + %l11 = load <4 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 4 + %l12 = load <4 x i8>, 
ptr %q1 + %l21 = load <4 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 4 + %l22 = load <4 x i8>, ptr %q2 + %le11 = zext <4 x i8> %l11 to <4 x i16> + %le12 = zext <4 x i8> %l12 to <4 x i16> + %le21 = zext <4 x i8> %l21 to <4 x i16> + %le22 = zext <4 x i8> %l22 to <4 x i16> + %l1 = add <4 x i16> %le11, %le21 + %l2 = add <4 x i16> %le12, %le22 + %e1 = zext <4 x i16> %l1 to <4 x i32> + %e2 = zext <4 x i16> %l2 to <4 x i32> + %e3 = shl <4 x i32> %e2, <i32 3, i32 3, i32 3, i32 3> + %a = add <4 x i32> %e1, %e3 + ret <4 x i32> %a +} + +define <16 x i16> @load_v16i8(ptr %p) { +; CHECK-LABEL: load_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp q2, q0, [x0] +; CHECK-NEXT: ushll2 v1.8h, v0.16b, #3 +; CHECK-NEXT: ushll v0.8h, v0.8b, #3 +; CHECK-NEXT: uaddw2 v1.8h, v1.8h, v2.16b +; CHECK-NEXT: uaddw v0.8h, v0.8h, v2.8b +; CHECK-NEXT: ret + %l1 = load <16 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 16 + %l2 = load <16 x i8>, ptr %q + %e1 = zext <16 x i8> %l1 to <16 x i16> + %e2 = zext <16 x i8> %l2 to <16 x i16> + %e3 = shl <16 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %a = add <16 x i16> %e1, %e3 + ret <16 x i16> %a +} + +define <2 x i16> @std_v2i8_v2i16(ptr %p) { +; CHECK-LABEL: std_v2i8_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldrb w8, [x0, #2] +; CHECK-NEXT: ldrb w9, [x0] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: ldrb w8, [x0, #3] +; CHECK-NEXT: fmov s1, w9 +; CHECK-NEXT: ldrb w9, [x0, #1] +; CHECK-NEXT: mov v0.s[1], w8 +; CHECK-NEXT: mov v1.s[1], w9 +; CHECK-NEXT: shl v0.2s, v0.2s, #3 +; CHECK-NEXT: add v0.2s, v1.2s, v0.2s +; CHECK-NEXT: ret + %l1 = load <2 x i8>, ptr %p + %q = getelementptr i8, ptr %p, i32 2 + %l2 = load <2 x i8>, ptr %q + %e1 = zext <2 x i8> %l1 to <2 x i16> + %e2 = zext <2 x i8> %l2 to <2 x i16> + %se2 = shl <2 x i16> %e2, <i16 3, i16 3> + %a = add <2 x i16> %e1, %se2 + ret <2 x i16> %a +} + +define <8 x i16> @load_bv_v4i8(ptr %p, ptr %q) { +; CHECK-LABEL: load_bv_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; 
CHECK-NEXT: ushll v1.8h, v1.8b, #3 +; CHECK-NEXT: uaddw v0.8h, v1.8h, v0.8b +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %e1 = zext <8 x i8> %l1 to <8 x i16> + %e2 = zext <8 x i8> %l2 to <8 x i16> + %e3 = shl <8 x i16> %e2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3> + %a = add <8 x i16> %e1, %e3 + ret <8 x i16> %a +} + +define <8 x i32> @load_bv_v4i8_i32(ptr %p, ptr %q) { +; CHECK-LABEL: load_bv_v4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 +; CHECK-NEXT: ushll v3.4s, v1.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %l1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %l2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %e1 = zext <8 x i8> %l1 to <8 x i32> + %e2 = zext <8 x i8> %l2 to <8 x i32> + %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %a = add <8 x i32> %e1, %e3 + ret <8 x i32> %a +} + +define <8 x i32> @load_bv_v4i16_i32(ptr %p, ptr %q) { +; CHECK-LABEL: load_bv_v4i16_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldp d3, d2, [x1] +; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: uaddw v1.4s, v2.4s, v3.4h +; CHECK-NEXT: ret + %j1 = load <4 x i16>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 8 + %j2 = load <4 x i16>, ptr %p1 + %k1 = load <4 
x i16>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 8 + %k2 = load <4 x i16>, ptr %q1 + %l1 = shufflevector <4 x i16> %j1, <4 x i16> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %l2 = shufflevector <4 x i16> %j2, <4 x i16> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %e1 = zext <8 x i16> %l1 to <8 x i32> + %e2 = zext <8 x i16> %l2 to <8 x i32> + %e3 = shl <8 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %a = add <8 x i32> %e1, %e3 + ret <8 x i32> %a +} + +define <12 x i32> @load_bv_3xv4i8_i32(ptr %p, ptr %q, ptr %r) { +; CHECK-LABEL: load_bv_3xv4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldp s3, s2, [x2] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ushll v2.8h, v2.8b, #0 +; CHECK-NEXT: ushll v3.8h, v3.8b, #0 +; CHECK-NEXT: ushll v2.4s, v2.4h, #3 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: uaddw v2.4s, v2.4s, v3.4h +; CHECK-NEXT: ushll2 v3.4s, v1.8h, #3 +; CHECK-NEXT: ushll v1.4s, v1.4h, #3 +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v1.4s, v0.4h +; CHECK-NEXT: stp q3, q2, [x8, #16] +; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %m1 = load <4 x i8>, ptr %r + %r1 = getelementptr i8, ptr %r, i32 4 + %m2 = load <4 x i8>, ptr %r1 + %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn1 = shufflevector <4 x i8> %m1, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %mn2 = shufflevector <4 x i8> %m2, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef> + %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> + %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11> + %e1 = zext <12 x i8> %l1 to <12 x i32> + %e2 = zext <12 x i8> %l2 to <12 x i32> + %e3 = shl <12 x i32> %e2, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> + %a = add <12 x i32> %e1, %e3 + ret <12 x i32> %a +} + +define <16 x i16> @load_bv_4xv4i8_i32(ptr %p, 
ptr %q, ptr %r, ptr %s) { +; CHECK-LABEL: load_bv_4xv4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: uaddl v1.8h, v2.8b, v3.8b +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %m1 = load <4 x i8>, ptr %r + %r1 = getelementptr i8, ptr %r, i32 4 + %m2 = load <4 x i8>, ptr %r1 + %n1 = load <4 x i8>, ptr %s + %s1 = getelementptr i8, ptr %s, i32 4 + %n2 = load <4 x i8>, ptr %s1 + %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> + %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> + %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> + %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> + %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> + %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> + %e1 = zext <16 x i8> %l1 to <16 x i16> + %e2 = zext <16 x i8> %l2 to <16 x i16> + %e3 = shl <16 x i16> %e2, + %a = add <16 x i16> %e1, %e2 + ret <16 x i16> %a +} + +define <8 x i32> @double_bv_2xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s) { +; CHECK-LABEL: double_bv_2xv4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: shll2 v1.4s, v2.8h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: saddw v0.4s, v2.4s, v0.4h +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = 
getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %m1 = load <4 x i8>, ptr %r + %r1 = getelementptr i8, ptr %r, i32 4 + %m2 = load <4 x i8>, ptr %r1 + %n1 = load <4 x i8>, ptr %s + %s1 = getelementptr i8, ptr %s, i32 4 + %n2 = load <4 x i8>, ptr %s1 + %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %ejk1 = zext <8 x i8> %jk1 to <8 x i16> + %ejk2 = zext <8 x i8> %jk2 to <8 x i16> + %ajk = sub <8 x i16> %ejk1, %ejk2 + %enm1 = zext <8 x i8> %mn1 to <8 x i16> + %enm2 = zext <8 x i8> %mn2 to <8 x i16> + %anm = sub <8 x i16> %enm1, %enm2 + %x = sext <8 x i16> %ajk to <8 x i32> + %y = zext <8 x i16> %anm to <8 x i32> + %ys = shl <8 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %a = add <8 x i32> %x, %ys + ret <8 x i32> %a +} + +define <16 x i32> @double_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) { +; CHECK-LABEL: double_bv_4xv4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ldp s4, s5, [x4] +; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: ld1 { v4.s }[1], [x5], #4 +; CHECK-NEXT: ldp s6, s7, [x6] +; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: ld1 { v5.s }[1], [x5] +; CHECK-NEXT: ld1 { v6.s }[1], [x7], #4 +; CHECK-NEXT: usubl v4.8h, v4.8b, v5.8b +; CHECK-NEXT: ld1 { v7.s }[1], [x7] +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll2 v1.4s, v4.8h, #16 +; CHECK-NEXT: usubl v4.8h, v6.8b, v7.8b +; CHECK-NEXT: saddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: saddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: shll2 v3.4s, v4.8h, #16 +; CHECK-NEXT: shll 
v4.4s, v4.4h, #16 +; CHECK-NEXT: saddw2 v3.4s, v3.4s, v2.8h +; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %m1 = load <4 x i8>, ptr %r + %r1 = getelementptr i8, ptr %r, i32 4 + %m2 = load <4 x i8>, ptr %r1 + %n1 = load <4 x i8>, ptr %s + %s1 = getelementptr i8, ptr %s, i32 4 + %n2 = load <4 x i8>, ptr %s1 + %j3 = load <4 x i8>, ptr %t + %t3 = getelementptr i8, ptr %t, i32 4 + %j4 = load <4 x i8>, ptr %t3 + %k3 = load <4 x i8>, ptr %u + %u3 = getelementptr i8, ptr %u, i32 4 + %k4 = load <4 x i8>, ptr %u3 + %m3 = load <4 x i8>, ptr %v + %v3 = getelementptr i8, ptr %v, i32 4 + %m4 = load <4 x i8>, ptr %v3 + %n3 = load <4 x i8>, ptr %w + %w3 = getelementptr i8, ptr %w, i32 4 + %n4 = load <4 x i8>, ptr %w3 + %jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn1 = shufflevector <4 x i8> %m1, <4 x i8> %n1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn2 = shufflevector <4 x i8> %m2, <4 x i8> %n2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn3 = shufflevector <4 x i8> %m3, <4 x i8> %n3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %mn4 = shufflevector <4 x i8> %m4, <4 x i8> %n4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %l1 = shufflevector <8 x i8> %jk1, <8 x i8> %mn1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %l2 = shufflevector <8 x i8> %jk2, <8 x i8> %mn2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %l3 = shufflevector <8 x i8> %jk3, <8 x i8> %mn3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %l4 = shufflevector <8 x i8> %jk4, <8 x i8> %mn4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %ejk1 = zext <16 x i8> %l1 to <16 x i16> + %ejk2 = zext <16 x i8> %l2 to <16 x i16> + %ajk = sub <16 x i16> %ejk1, %ejk2 + %enm1 = zext <16 x i8> %l3 to <16 x i16> + %enm2 = zext <16 x i8> %l4 to <16 x i16> + %anm = sub <16 x i16> %enm1, %enm2 + %x = sext <16 x i16> %ajk to <16 x i32> + %y = zext <16 x i16> %anm to <16 x i32> + 
%ys = shl <16 x i32> %y, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16> + %a = add <16 x i32> %x, %ys + ret <16 x i32> %a +} + +define <16 x i32> @double2_bv_4xv4i8_i32(ptr %p, ptr %q, ptr %r, ptr %s, ptr %t, ptr %u, ptr %v, ptr %w) { +; CHECK-LABEL: double2_bv_4xv4i8_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: ldp s4, s5, [x6] +; CHECK-NEXT: ldp s6, s7, [x4] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v4.s }[1], [x7], #4 +; CHECK-NEXT: ld1 { v6.s }[1], [x5], #4 +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: ld1 { v5.s }[1], [x7] +; CHECK-NEXT: ld1 { v7.s }[1], [x5] +; CHECK-NEXT: usubl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: usubl v4.8h, v0.8b, v4.8b +; CHECK-NEXT: usubl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: usubl v3.8h, v3.8b, v7.8b +; CHECK-NEXT: shll v5.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v3.4h, #16 +; CHECK-NEXT: shll2 v3.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v1.8h, #16 +; CHECK-NEXT: saddw2 v1.4s, v3.4s, v2.8h +; CHECK-NEXT: saddw2 v3.4s, v6.4s, v4.8h +; CHECK-NEXT: saddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: saddw v2.4s, v5.4s, v4.4h +; CHECK-NEXT: ret + %j1 = load <4 x i8>, ptr %p + %p1 = getelementptr i8, ptr %p, i32 4 + %j2 = load <4 x i8>, ptr %p1 + %k1 = load <4 x i8>, ptr %q + %q1 = getelementptr i8, ptr %q, i32 4 + %k2 = load <4 x i8>, ptr %q1 + %m1 = load <4 x i8>, ptr %r + %r1 = getelementptr i8, ptr %r, i32 4 + %m2 = load <4 x i8>, ptr %r1 + %n1 = load <4 x i8>, ptr %s + %s1 = getelementptr i8, ptr %s, i32 4 + %n2 = load <4 x i8>, ptr %s1 + %j3 = load <4 x i8>, ptr %t + %t3 = getelementptr i8, ptr %t, i32 4 + %j4 = load <4 x i8>, ptr %t3 + %k3 = load <4 x i8>, ptr %u + %u3 = getelementptr i8, ptr %u, i32 4 + %k4 = load <4 x i8>, ptr %u3 + %m3 = load <4 x i8>, ptr %v + %v3 = getelementptr i8, ptr %v, i32 4 + %m4 = load <4 x i8>, ptr %v3 + %n3 = load <4 x i8>, ptr %w + %w3 = getelementptr i8, ptr %w, i32 4 + %n4 = load <4 x i8>, ptr %w3 + 
%jk1 = shufflevector <4 x i8> %j1, <4 x i8> %k1, <16 x i32> + %m1l = shufflevector <4 x i8> %m1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %n1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %j2, <4 x i8> %k2, <16 x i32> + %m2l = shufflevector <4 x i8> %m2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %n2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %j3, <4 x i8> %k3, <16 x i32> + %m3l = shufflevector <4 x i8> %m3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %n3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %j4, <4 x i8> %k4, <16 x i32> + %m4l = shufflevector <4 x i8> %m4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %n4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + %ejk1 = zext <16 x i8> %l1 to <16 x i16> + %ejk2 = zext <16 x i8> %l3 to <16 x i16> + %ajk = sub <16 x i16> %ejk1, %ejk2 + %enm1 = zext <16 x i8> %l2 to <16 x i16> + %enm2 = zext <16 x i8> %l4 to <16 x i16> + %anm = sub <16 x i16> %enm1, %enm2 + %x = sext <16 x i16> %ajk to <16 x i32> + %y = zext <16 x i16> %anm to <16 x i32> + %ys = shl <16 x i32> %y, + %a = add <16 x i32> %x, %ys + ret <16 x i32> %a +} + +define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_load: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: add x10, x1, #4 +; CHECK-NEXT: add x11, x1, #8 +; CHECK-NEXT: add x12, x1, #12 +; CHECK-NEXT: str s0, [x4] +; 
CHECK-NEXT: ldp s1, s5, [x2] +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: umov w9, v1.h[1] +; CHECK-NEXT: mov v2.b[8], w8 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: mov v2.b[9], w9 +; CHECK-NEXT: umov w9, v1.h[3] +; CHECK-NEXT: ldr s1, [x1] +; CHECK-NEXT: mov v2.b[10], w8 +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b +; CHECK-NEXT: mov v2.b[11], w9 +; CHECK-NEXT: add x9, x3, #12 +; CHECK-NEXT: ld1 { v2.s }[3], [x3], #4 +; CHECK-NEXT: ldp s3, s4, [x0, #4] +; CHECK-NEXT: ldp s6, s7, [x2, #8] +; CHECK-NEXT: ldr s16, [x0, #12] +; CHECK-NEXT: ld1 { v5.s }[1], [x3] +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: ld1 { v3.s }[1], [x10] +; CHECK-NEXT: ld1 { v16.s }[1], [x12] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: ushll v1.8h, v6.8b, #0 +; CHECK-NEXT: uaddl v0.8h, v0.8b, v4.8b +; CHECK-NEXT: uaddl v3.8h, v3.8b, v16.8b +; CHECK-NEXT: uaddl v4.8h, v5.8b, v7.8b +; CHECK-NEXT: uaddw2 v2.8h, v1.8h, v2.16b +; CHECK-NEXT: ushll v5.4s, v3.4h, #3 +; CHECK-NEXT: ushll2 v1.4s, v3.8h, #3 +; CHECK-NEXT: ushll v6.4s, v4.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v5.4s, v0.4h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + store <4 x i8> %lp1, ptr %z + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 + %lp3 = load <4 x i8>, ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, ptr %q2 + %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r 
+ %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + + %le11 = zext <16 x i8> %l1 to <16 x i16> + %le12 = zext <16 x i8> %l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + %la1 = add <16 x i16> %le11, %le12 + 
%la2 = add <16 x i16> %le21, %le22 + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + %se2 = shl <16 x i32> %e2, + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + +define <16 x i32> @extrause_shuffle(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_shuffle: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: ldp s6, s7, [x0, #8] +; CHECK-NEXT: ldr s18, [x1, #12] +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: ldr s5, [x3, #12] +; CHECK-NEXT: ldp s16, s17, [x2, #8] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 +; CHECK-NEXT: mov v4.16b, v7.16b +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: mov v4.s[1], v18.s[0] +; CHECK-NEXT: ld1 { v16.s }[1], [x8] +; CHECK-NEXT: mov v7.s[1], v18.s[0] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: mov v4.s[2], v17.s[0] +; CHECK-NEXT: mov v17.s[1], v5.s[0] +; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b +; CHECK-NEXT: uaddl v6.8h, v0.8b, v16.8b +; CHECK-NEXT: mov v4.s[3], v5.s[0] +; CHECK-NEXT: uaddl v7.8h, v1.8b, v17.8b +; CHECK-NEXT: ushll2 v0.4s, v3.8h, #3 +; CHECK-NEXT: ushll v3.4s, v3.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v0.4s, v2.8h +; CHECK-NEXT: str q4, [x4] +; CHECK-NEXT: uaddw v0.4s, v3.4s, v2.4h +; CHECK-NEXT: ushll2 v2.4s, v7.8h, #3 +; CHECK-NEXT: ushll v7.4s, v7.4h, #3 +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v6.8h +; CHECK-NEXT: uaddw v2.4s, v7.4s, v6.4h +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 + %lp3 = load <4 x i8>, ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, ptr %q2 + %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x 
i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r + %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + store <16 x i8> %l4, ptr %z + + %le11 = zext <16 x i8> %l1 to <16 x i16> + %le12 = zext <16 x i8> 
%l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + %la1 = add <16 x i16> %le11, %le12 + %la2 = add <16 x i16> %le21, %le22 + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + %se2 = shl <16 x i32> %e2, + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + +define <16 x i32> @extrause_ext(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_ext: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: ldp s5, s6, [x2, #8] +; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: ldp s7, s4, [x0, #8] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v6.s }[1], [x11] +; CHECK-NEXT: ld1 { v4.s }[1], [x10] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: ld1 { v7.s }[1], [x9] +; CHECK-NEXT: ld1 { v5.s }[1], [x8] +; CHECK-NEXT: uaddl v1.8h, v1.8b, v6.8b +; CHECK-NEXT: uaddl v3.8h, v3.8b, v4.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v7.8b +; CHECK-NEXT: uaddl v5.8h, v0.8b, v5.8b +; CHECK-NEXT: ushll v7.4s, v1.4h, #3 +; CHECK-NEXT: ushll v0.4s, v3.4h, #3 +; CHECK-NEXT: ushll2 v3.4s, v3.8h, #3 +; CHECK-NEXT: ushll2 v16.4s, v1.8h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v3.4s, v2.8h +; CHECK-NEXT: uaddw2 v3.4s, v16.4s, v5.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: uaddw v2.4s, v7.4s, v5.4h +; CHECK-NEXT: ushll v5.8h, v6.8b, #0 +; CHECK-NEXT: ushll v4.8h, v4.8b, #0 +; CHECK-NEXT: stp q4, q5, [x4] +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 + %lp3 = load <4 x i8>, ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, 
ptr %q2 + %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r + %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + + %le11 = zext <16 x i8> %l1 
to <16 x i16> + %le12 = zext <16 x i8> %l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + store <16 x i16> %le22, ptr %z + %la1 = add <16 x i16> %le11, %le12 + %la2 = add <16 x i16> %le21, %le22 + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + %se2 = shl <16 x i32> %e2, + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + +define <16 x i32> @extrause_add(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_add: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: ldp s4, s5, [x0, #8] +; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: ldp s6, s7, [x2, #8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: ld1 { v7.s }[1], [x11] +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v5.s }[1], [x10] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v4.8b +; CHECK-NEXT: uaddl v4.8h, v3.8b, v7.8b +; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: ushll v6.4s, v4.4h, #3 +; CHECK-NEXT: ushll v7.4s, v5.4h, #3 +; CHECK-NEXT: stp q5, q4, [x4] +; CHECK-NEXT: ushll2 v1.4s, v5.8h, #3 +; CHECK-NEXT: ushll2 v3.4s, v4.8h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, v0.8h +; CHECK-NEXT: uaddw2 v3.4s, v3.4s, v2.8h +; CHECK-NEXT: uaddw v0.4s, v7.4s, v0.4h +; CHECK-NEXT: uaddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 + %lp3 = load <4 x i8>, ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, ptr %q2 
+ %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r + %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + + %le11 = zext <16 x i8> %l1 to <16 x 
i16> + %le12 = zext <16 x i8> %l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + %la1 = add <16 x i16> %le11, %le12 + %la2 = add <16 x i16> %le21, %le22 + store <16 x i16> %la2, ptr %z + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + %se2 = shl <16 x i32> %e2, + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + +define <16 x i32> @extrause_ext2(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_ext2: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x2] +; CHECK-NEXT: ldp s2, s3, [x0] +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: ldp s4, s5, [x2, #8] +; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: ldp s6, s7, [x0, #8] +; CHECK-NEXT: ld1 { v0.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v2.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v5.s }[1], [x11] +; CHECK-NEXT: ld1 { v6.s }[1], [x9] +; CHECK-NEXT: ld1 { v7.s }[1], [x10] +; CHECK-NEXT: ld1 { v3.s }[1], [x1] +; CHECK-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-NEXT: ld1 { v4.s }[1], [x8] +; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: uaddl v6.8h, v3.8b, v7.8b +; CHECK-NEXT: uaddl v5.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v4.8h, v0.8b, v4.8b +; CHECK-NEXT: ushll2 v0.4s, v6.8h, #3 +; CHECK-NEXT: ushll v3.4s, v6.4h, #3 +; CHECK-NEXT: ushll v7.4s, v5.4h, #0 +; CHECK-NEXT: ushll2 v16.4s, v5.8h, #0 +; CHECK-NEXT: uaddw2 v1.4s, v0.4s, v2.8h +; CHECK-NEXT: uaddw v0.4s, v3.4s, v2.4h +; CHECK-NEXT: stp q7, q16, [x4, #32] +; CHECK-NEXT: ushll2 v2.4s, v5.8h, #3 +; CHECK-NEXT: ushll v5.4s, v5.4h, #3 +; CHECK-NEXT: uaddw2 v3.4s, v2.4s, v4.8h +; CHECK-NEXT: uaddw v2.4s, v5.4s, v4.4h +; CHECK-NEXT: ushll2 v4.4s, v6.8h, #0 +; CHECK-NEXT: ushll v5.4s, v6.4h, #0 +; CHECK-NEXT: stp q5, q4, [x4] +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 + %lp3 = load <4 x i8>, 
ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, ptr %q2 + %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r + %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x i8> %jk4, <16 x i8> %m4l, 
<16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + + %le11 = zext <16 x i8> %l1 to <16 x i16> + %le12 = zext <16 x i8> %l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + %la1 = add <16 x i16> %le11, %le12 + %la2 = add <16 x i16> %le21, %le22 + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + store <16 x i32> %e2, ptr %z + %se2 = shl <16 x i32> %e2, + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + +define <16 x i32> @extrause_shl(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) { +; CHECK-LABEL: extrause_shl: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp s0, s1, [x0] +; CHECK-NEXT: ldp s2, s3, [x2] +; CHECK-NEXT: add x8, x3, #8 +; CHECK-NEXT: add x9, x1, #8 +; CHECK-NEXT: ldp s4, s5, [x0, #8] +; CHECK-NEXT: add x10, x1, #12 +; CHECK-NEXT: add x11, x3, #12 +; CHECK-NEXT: ldp s6, s7, [x2, #8] +; CHECK-NEXT: ld1 { v2.s }[1], [x3], #4 +; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4 +; CHECK-NEXT: ld1 { v5.s }[1], [x10] +; CHECK-NEXT: ld1 { v7.s }[1], [x11] +; CHECK-NEXT: ld1 { v3.s }[1], [x3] +; CHECK-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-NEXT: ld1 { v4.s }[1], [x9] +; CHECK-NEXT: ld1 { v6.s }[1], [x8] +; CHECK-NEXT: uaddl v3.8h, v3.8b, v7.8b +; CHECK-NEXT: uaddl v1.8h, v1.8b, v5.8b +; CHECK-NEXT: uaddl v0.8h, v0.8b, v4.8b +; CHECK-NEXT: uaddl v2.8h, v2.8b, v6.8b +; CHECK-NEXT: ushll v4.4s, v1.4h, #3 +; CHECK-NEXT: ushll v5.4s, v3.4h, #3 +; CHECK-NEXT: ushll2 v6.4s, v1.8h, #3 +; CHECK-NEXT: ushll2 v7.4s, v3.8h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v6.4s, v0.8h +; CHECK-NEXT: stp q4, q6, [x4] +; CHECK-NEXT: uaddw2 v3.4s, v7.4s, v2.8h +; CHECK-NEXT: stp q5, q7, [x4, #32] +; CHECK-NEXT: uaddw v0.4s, v4.4s, v0.4h +; CHECK-NEXT: uaddw v2.4s, v5.4s, v2.4h +; CHECK-NEXT: ret + %lp1 = load <4 x i8>, ptr %p + %p2 = getelementptr i8, ptr %p, i32 4 + %lp2 = load <4 x i8>, ptr %p2 + %p3 = getelementptr i8, ptr %p, i32 8 
+ %lp3 = load <4 x i8>, ptr %p3 + %p4 = getelementptr i8, ptr %p, i32 12 + %lp4 = load <4 x i8>, ptr %p4 + %lq1 = load <4 x i8>, ptr %q + %q2 = getelementptr i8, ptr %q, i32 4 + %lq2 = load <4 x i8>, ptr %q2 + %q3 = getelementptr i8, ptr %q, i32 8 + %lq3 = load <4 x i8>, ptr %q3 + %q4 = getelementptr i8, ptr %q, i32 12 + %lq4 = load <4 x i8>, ptr %q4 + %lr1 = load <4 x i8>, ptr %r + %r2 = getelementptr i8, ptr %r, i32 4 + %lr2 = load <4 x i8>, ptr %r2 + %r3 = getelementptr i8, ptr %r, i32 8 + %lr3 = load <4 x i8>, ptr %r3 + %r4 = getelementptr i8, ptr %r, i32 12 + %lr4 = load <4 x i8>, ptr %r4 + %ls1 = load <4 x i8>, ptr %s + %s2 = getelementptr i8, ptr %s, i32 4 + %ls2 = load <4 x i8>, ptr %s2 + %s3 = getelementptr i8, ptr %s, i32 8 + %ls3 = load <4 x i8>, ptr %s3 + %s4 = getelementptr i8, ptr %s, i32 12 + %ls4 = load <4 x i8>, ptr %s4 + + %jk1 = shufflevector <4 x i8> %lp1, <4 x i8> %lq1, <16 x i32> + %m1l = shufflevector <4 x i8> %lr1, <4 x i8> poison, <16 x i32> + %jkm1 = shufflevector <16 x i8> %jk1, <16 x i8> %m1l, <16 x i32> + %n1l = shufflevector <4 x i8> %ls1, <4 x i8> poison, <16 x i32> + %l1 = shufflevector <16 x i8> %jkm1, <16 x i8> %n1l, <16 x i32> + %jk2 = shufflevector <4 x i8> %lp2, <4 x i8> %lq2, <16 x i32> + %m2l = shufflevector <4 x i8> %lr2, <4 x i8> poison, <16 x i32> + %jkm2 = shufflevector <16 x i8> %jk2, <16 x i8> %m2l, <16 x i32> + %n2l = shufflevector <4 x i8> %ls2, <4 x i8> poison, <16 x i32> + %l2 = shufflevector <16 x i8> %jkm2, <16 x i8> %n2l, <16 x i32> + %jk3 = shufflevector <4 x i8> %lp3, <4 x i8> %lq3, <16 x i32> + %m3l = shufflevector <4 x i8> %lr3, <4 x i8> poison, <16 x i32> + %jkm3 = shufflevector <16 x i8> %jk3, <16 x i8> %m3l, <16 x i32> + %n3l = shufflevector <4 x i8> %ls3, <4 x i8> poison, <16 x i32> + %l3 = shufflevector <16 x i8> %jkm3, <16 x i8> %n3l, <16 x i32> + %jk4 = shufflevector <4 x i8> %lp4, <4 x i8> %lq4, <16 x i32> + %m4l = shufflevector <4 x i8> %lr4, <4 x i8> poison, <16 x i32> + %jkm4 = shufflevector <16 x 
i8> %jk4, <16 x i8> %m4l, <16 x i32> + %n4l = shufflevector <4 x i8> %ls4, <4 x i8> poison, <16 x i32> + %l4 = shufflevector <16 x i8> %jkm4, <16 x i8> %n4l, <16 x i32> + + %le11 = zext <16 x i8> %l1 to <16 x i16> + %le12 = zext <16 x i8> %l3 to <16 x i16> + %le21 = zext <16 x i8> %l2 to <16 x i16> + %le22 = zext <16 x i8> %l4 to <16 x i16> + %la1 = add <16 x i16> %le11, %le12 + %la2 = add <16 x i16> %le21, %le22 + %e1 = zext <16 x i16> %la1 to <16 x i32> + %e2 = zext <16 x i16> %la2 to <16 x i32> + %se2 = shl <16 x i32> %e2, + store <16 x i32> %se2, ptr %z + %a = add <16 x i32> %e1, %se2 + ret <16 x i32> %a +} + + +define <8 x i32> @commuted_loads(ptr %p1, ptr %p2) { +; CHECK-LABEL: commuted_loads: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldp d3, d2, [x1] +; CHECK-NEXT: add v0.8b, v3.8b, v0.8b +; CHECK-NEXT: add v1.8b, v2.8b, v1.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 +; CHECK-NEXT: ushll v3.4s, v1.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v2.4s, v0.8h +; CHECK-NEXT: uaddw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: ret + %l11 = load <8 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 8 + %l12 = load <8 x i8>, ptr %q1 + %l21 = load <8 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 8 + %l22 = load <8 x i8>, ptr %q2 + %l1 = add <8 x i8> %l21, %l11 + %l2 = add <8 x i8> %l22, %l12 + %e1 = zext <8 x i8> %l1 to <8 x i32> + %e2 = zext <8 x i8> %l2 to <8 x i32> + %se2 = shl <8 x i32> %e2, + %a = add <8 x i32> %e1, %se2 + ret <8 x i32> %a +} + +define <8 x i32> @commuted_loads2(ptr %p1, ptr %p2) { +; CHECK-LABEL: commuted_loads2: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldp d2, d3, [x1] +; CHECK-NEXT: add v0.8b, v0.8b, v2.8b +; CHECK-NEXT: add v1.8b, v1.8b, v3.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v2.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v1.4s, v0.8h, #3 +; CHECK-NEXT: ushll v0.4s, v0.4h, #3 +; CHECK-NEXT: uaddw2 v1.4s, v1.4s, 
v2.8h +; CHECK-NEXT: uaddw v0.4s, v0.4s, v2.4h +; CHECK-NEXT: ret + %l11 = load <8 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 8 + %l12 = load <8 x i8>, ptr %q1 + %l21 = load <8 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 8 + %l22 = load <8 x i8>, ptr %q2 + %l1 = add <8 x i8> %l11, %l21 + %l2 = add <8 x i8> %l12, %l22 + %e1 = zext <8 x i8> %l2 to <8 x i32> + %e2 = zext <8 x i8> %l1 to <8 x i32> + %se2 = shl <8 x i32> %e2, + %a = add <8 x i32> %e1, %se2 + ret <8 x i32> %a +} + +define <8 x i32> @commuted_sub(ptr %p1, ptr %p2) { +; CHECK-LABEL: commuted_sub: +; CHECK: // %bb.0: +; CHECK-NEXT: ldp d0, d1, [x0] +; CHECK-NEXT: ldp d3, d2, [x1] +; CHECK-NEXT: add v0.8b, v0.8b, v3.8b +; CHECK-NEXT: add v1.8b, v1.8b, v2.8b +; CHECK-NEXT: ushll v0.8h, v0.8b, #0 +; CHECK-NEXT: ushll v1.8h, v1.8b, #0 +; CHECK-NEXT: ushll2 v2.4s, v1.8h, #3 +; CHECK-NEXT: ushll v3.4s, v1.4h, #3 +; CHECK-NEXT: usubw2 v1.4s, v2.4s, v0.8h +; CHECK-NEXT: usubw v0.4s, v3.4s, v0.4h +; CHECK-NEXT: ret + %l11 = load <8 x i8>, ptr %p1 + %q1 = getelementptr i8, ptr %p1, i32 8 + %l12 = load <8 x i8>, ptr %q1 + %l21 = load <8 x i8>, ptr %p2 + %q2 = getelementptr i8, ptr %p2, i32 8 + %l22 = load <8 x i8>, ptr %q2 + %l1 = add <8 x i8> %l11, %l21 + %l2 = add <8 x i8> %l12, %l22 + %e1 = zext <8 x i8> %l1 to <8 x i32> + %e2 = zext <8 x i8> %l2 to <8 x i32> + %se2 = shl <8 x i32> %e2, + %a = sub <8 x i32> %se2, %e1 + ret <8 x i32> %a +} -- 2.7.4