From 549db744bde29c8331411a4b41607a33c363c108 Mon Sep 17 00:00:00 2001
From: David Green
Date: Thu, 21 Nov 2019 14:06:54 +0000
Subject: [PATCH] [ARM] Lots of MVE offset masked load and store tests. NFC

---
 llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll | 2646 +++++++++++++++++++
 .../test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll | 2726 ++++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll | 2726 ++++++++++++++++++++
 llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll        |  118 +-
 4 files changed, 8157 insertions(+), 59 deletions(-)
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
 create mode 100644 llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll

diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
new file mode 100644
index 0000000..ba3ef58
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-offset.ll
@@ -0,0 +1,2646 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+
+define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    adds r3, r0, #3
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    adds r3, r0, #2
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0, #508]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 508
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #512
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 512
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m508:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0, #-508]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -508
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m512:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #512
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -512
+  %0 = bitcast i8* %z to <4 x i32>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+  %2 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %1, <4 x i32>* %2, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    adds r3, r0, #3
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0, #2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0, #254]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #256
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r0, #-254]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #256
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = zext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    adds r3, r0, #3
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0, #2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0, #254]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #256
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r0, #-254]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #256
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrht.s32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %z to <4 x i16>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+  %2 = sext <4 x i16> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0, #4]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    adds r3, r0, #3
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0, #2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0, #254]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 254
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #256
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 256
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m254:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r0, #-254]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -254
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m256:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #256
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrht.u16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -256
+  %0 = bitcast i8* %z to <8 x i16>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+  %2 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %1, <8 x i16>* %2, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0, #3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0, #2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0, #127]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #128
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r0, #-127]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #128
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = zext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0, #3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0, #2]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0, #127]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #128
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r0, #-127]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #128
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %z to <4 x i8>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
+  %c = icmp ne <4 x i32> %mask, zeroinitializer
+  %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+  %2 = sext <4 x i8> %1 to <4 x i32>
+  %3 = bitcast i8* %y to <4 x i32>*
+  store <4 x i32> %2, <4 x i32>* %3, align 4
+  ret i8* %x
+}
+
+define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0, #4]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0, #3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0, #2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0, #127]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #128
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r0, #-127]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #128
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = zext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0, #4]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0, #3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0, #2]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0, #127]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #128
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r0, #-127]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrh.u16 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #128
+; CHECK-NEXT:    vpt.i16 ne, q0, zr
+; CHECK-NEXT:    vldrbt.s16 q0, [r3]
+; CHECK-NEXT:    vstrh.16 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %z to <8 x i8>*
+  %mask = load <8 x i16>, <8 x i16>* %m, align 2
+  %c = icmp ne <8 x i16> %mask, zeroinitializer
+  %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+  %2 = sext <8 x i8> %1 to <8 x i16>
+  %3 = bitcast i8* %y to <8 x i16>*
+  store <8 x i16> %2, <8 x i16>* %3, align 2
+  ret i8* %x
+}
+
+define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0, #4]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_3:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0, #3]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 3
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_2:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0, #2]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 2
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0, #127]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 127
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    add.w r3, r0, #128
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r3]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 128
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m127:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r0, #-127]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -127
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m128:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q0, [r2]
+; CHECK-NEXT:    sub.w r3, r0, #128
+; CHECK-NEXT:    vpt.i8 ne, q0, zr
+; CHECK-NEXT:    vldrbt.u8 q0, [r3]
+; CHECK-NEXT:    vstrb.8 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 -128
+  %0 = bitcast i8* %z to <16 x i8>*
+  %mask = load <16 x i8>, <16 x i8>* %m, align 1
+  %c = icmp ne <16 x i8> %mask, zeroinitializer
+  %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+  %2 = bitcast i8* %y to <16 x i8>*
+  store <16 x i8> %1, <16 x i8>* %2, align 1
+  ret i8* %x
+}
+
+define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vpt.i32 ne, q0, zr
+; CHECK-NEXT:    vldrwt.u32 q0, [r0, #4]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %z = getelementptr inbounds i8, i8* %x, i32 4
+  %0 = bitcast i8* %z to <4 x float>*
+  %mask = load <4 x i32>, <4 x i32>* %m, align 4
%c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adds r3, r0, #2 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, 
zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #512 +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %z to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %x +} + +define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #4] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: adds r3, r0, #3 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #2] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #254] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> 
%c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: add.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, #-254] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + +define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: sub.w r3, r0, #256 +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r3] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %x +} + + + + +define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #4] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #3 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + 
ret i8* %y +} + +define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: add.w r1, r0, #512 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: sub.w r1, r0, #512 +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %y +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; 
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r0, #254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #256
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r0, #-254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strh32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #256
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrht.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i16>, <4 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <4 x i16>*
+ call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #256
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strh16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #256
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i16>, <8 x i16>* %0, align 2
+ %2 = bitcast i8* %z to <8 x i16>*
+ call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r0, #3]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r0, #127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #128
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r0, #-127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strb32_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #128
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrbt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x i8>, <4 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <4 x i8>*
+ call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r0, #3]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r0, #127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #128
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r0, #-127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -127
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strb16_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #128
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrbt.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -128
+ %0 = bitcast i8* %x to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x i8>, <8 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <8 x i8>*
+ call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0, #3]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0, #127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #128
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0, #-127]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #128
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %z to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0, #508]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 508
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #512
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 512
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0, #-508]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -508
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #512
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -512
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %z to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #4]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: adds r1, r0, #3
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #2]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: add.w r1, r0, #256
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0, #-254]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -254
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: sub.w r1, r0, #256
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -256
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %z to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %y
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
new file mode 100644
index 0000000..69286c8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-postinc.ll
@@ -0,0 +1,2726 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+
+define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %x to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %x to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %x to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %x to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
@llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, 
align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrbs32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef) + %2 = sext <4 x i8> %1 to <4 x i32> + %3 = bitcast i8* %y to <4 x i32>* + store <4 x i32> %2, <4 x i32>* %3, align 4 + ret i8* %z +} + +define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = 
bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbu16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = zext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrh.16 q0, [r1] +; 
CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrbs16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: 
vldrbt.s16 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef) + %2 = sext <8 x i8> %1 to <8 x i16> + %3 = bitcast i8* %y to <8 x i16>* + store <8 x i16> %2, <8 x i16>* %3, align 2 + ret i8* %z +} + +define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vstrb.8 q0, [r1] +; 
CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) { +; CHECK-LABEL: ldrbu8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r2] +; CHECK-NEXT: vpt.i8 ne, q0, zr +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vstrb.8 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef) + %2 = bitcast i8* %y to <16 x i8>* + store <16 x i8> %1, <16 x i8>* %2, align 1 + ret i8* %z +} + +define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* 
%x, i32 2 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) { +; CHECK-LABEL: ldrwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vpt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vstrw.32 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef) + %2 = bitcast i8* %y to <4 x float>* + store <4 x float> %1, <4 x float>* %2, align 4 + ret i8* %z +} + +define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = 
getelementptr inbounds i8, i8* %x, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 3 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 2 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 254 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -254 + 
%0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + + + + +define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 
x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %y to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void 
@llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %y to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh16_4(i8* 
%y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, 
[r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %y to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, 
i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %y to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 
+ %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %y to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb8_4(i8* %y, i8* 
%x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %y to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; 
CHECK-NEXT: vstrbt.8 q0, [r0]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -127
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %y to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) {
+; CHECK-LABEL: strb8_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vldrb.u8 q1, [r2]
+; CHECK-NEXT: vpt.i8 ne, q1, zr
+; CHECK-NEXT: vstrbt.8 q0, [r0]
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -128
+ %0 = bitcast i8* %x to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = load <16 x i8>, <16 x i8>* %0, align 1
+ %2 = bitcast i8* %y to <16 x i8>*
+ call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 508
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 512
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -508
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) {
+; CHECK-LABEL: strwf32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r2]
+; CHECK-NEXT: vpt.i32 ne, q1, zr
+; CHECK-NEXT: vstrwt.32 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -512
+ %0 = bitcast i8* %x to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = load <4 x float>, <4 x float>* %0, align 4
+ %2 = bitcast i8* %y to <4 x float>*
+ call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 4
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 3
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 2
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 254
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 256
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -254
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) {
+; CHECK-LABEL: strhf16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r1]
+; CHECK-NEXT: vldrh.u16 q1, [r2]
+; CHECK-NEXT: vpt.i16 ne, q1, zr
+; CHECK-NEXT: vstrht.16 q0, [r0]
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %y, i32 -256
+ %0 = bitcast i8* %x to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = load <8 x half>, <8 x half>* %0, align 2
+ %2 = bitcast i8* %y to <8 x half>*
+ call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c)
+ ret i8* %z
+}
+
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
+declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)
+declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>)
+declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>)
+declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>)
+declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>)
+declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>)
+declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>)
+
+declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>)
+declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>)
+declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
+declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>)
diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
new file mode 100644
index 0000000..2874469
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst-preinc.ll
@@ -0,0 +1,2726 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE
+; RUN: llc -mtriple=thumbebv8.1m.main-arm-none-eabi -mattr=+mve -enable-arm-maskedldst -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
+
+define i8* @ldrwu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwu32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwu32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %z to <4 x i32>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %c, <4 x i32> undef)
+ %2 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %1, <4 x i32>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #254]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0, #-254]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhu32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = zext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #254]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m254(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0, #-254]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhs32_m256(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrhs32_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrht.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <4 x i16>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %c, <4 x i16> undef)
+ %2 = sext <4 x i16> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhu16_m256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhu16_m256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #256
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -256
+ %0 = bitcast i8* %z to <8 x i16>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %c, <8 x i16> undef)
+ %2 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %1, <8 x i16>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #3]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #127]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0, #-127]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbu32_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #3]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #127]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_m127(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0, #-127]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbs32_m128(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrbs32_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrbt.s32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %z to <4 x i8>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %c, <4 x i8> undef)
+ %2 = sext <4 x i8> %1 to <4 x i32>
+ %3 = bitcast i8* %y to <4 x i32>*
+ store <4 x i32> %2, <4 x i32>* %3, align 4
+ ret i8* %z
+}
+
+define i8* @ldrbu16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #3]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #127]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0, #-127]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbu16_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = zext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #3]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #127]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_m127(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0, #-127]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbs16_m128(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrbs16_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrbt.s16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %z to <8 x i8>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %c, <8 x i8> undef)
+ %2 = sext <8 x i8> %1 to <8 x i16>
+ %3 = bitcast i8* %y to <8 x i16>*
+ store <8 x i16> %2, <8 x i16>* %3, align 2
+ ret i8* %z
+}
+
+define i8* @ldrbu8_4(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_3(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #3]
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_2(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #127]
+; CHECK-NEXT: adds r0, #127
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 127
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #128
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 128
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_m127(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m127:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0, #-127]
+; CHECK-NEXT: subs r0, #127
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -127
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrbu8_m128(i8* %x, i8* %y, <16 x i8> *%m) {
+; CHECK-LABEL: ldrbu8_m128:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: subs r0, #128
+; CHECK-NEXT: vldrb.u8 q0, [r2]
+; CHECK-NEXT: vpt.i8 ne, q0, zr
+; CHECK-NEXT: vldrbt.u8 q0, [r0]
+; CHECK-NEXT: vstrb.8 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -128
+ %0 = bitcast i8* %z to <16 x i8>*
+ %mask = load <16 x i8>, <16 x i8>* %m, align 1
+ %c = icmp ne <16 x i8> %mask, zeroinitializer
+ %1 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %c, <16 x i8> undef)
+ %2 = bitcast i8* %y to <16 x i8>*
+ store <16 x i8> %1, <16 x i8>* %2, align 1
+ ret i8* %z
+}
+
+define i8* @ldrwf32_4(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_3(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_2(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #508]
+; CHECK-NEXT: add.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 508
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 512
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_m508(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_m508:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0, #-508]
+; CHECK-NEXT: sub.w r0, r0, #508
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -508
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrwf32_m512(i8* %x, i8* %y, <4 x i32> *%m) {
+; CHECK-LABEL: ldrwf32_m512:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: sub.w r0, r0, #512
+; CHECK-NEXT: vldrw.u32 q0, [r2]
+; CHECK-NEXT: vpt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -512
+ %0 = bitcast i8* %z to <4 x float>*
+ %mask = load <4 x i32>, <4 x i32>* %m, align 4
+ %c = icmp ne <4 x i32> %mask, zeroinitializer
+ %1 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %c, <4 x float> undef)
+ %2 = bitcast i8* %y to <4 x float>*
+ store <4 x float> %1, <4 x float>* %2, align 4
+ ret i8* %z
+}
+
+define i8* @ldrhf16_4(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_4:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #4]
+; CHECK-NEXT: adds r0, #4
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 4
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_3(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_3:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: adds r0, #3
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 3
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_2(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_2:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #2]
+; CHECK-NEXT: adds r0, #2
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 2
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #254]
+; CHECK-NEXT: adds r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 254
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_256(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_256:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: add.w r0, r0, #256
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0]
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 256
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8* %y to <8 x half>*
+ store <8 x half> %1, <8 x half>* %2, align 2
+ ret i8* %z
+}
+
+define i8* @ldrhf16_m254(i8* %x, i8* %y, <8 x i16> *%m) {
+; CHECK-LABEL: ldrhf16_m254:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrh.u16 q0, [r2]
+; CHECK-NEXT: vpt.i16 ne, q0, zr
+; CHECK-NEXT: vldrht.u16 q0, [r0, #-254]
+; CHECK-NEXT: subs r0, #254
+; CHECK-NEXT: vstrh.16 q0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+ %z = getelementptr inbounds i8, i8* %x, i32 -254
+ %0 = bitcast i8* %z to <8 x half>*
+ %mask = load <8 x i16>, <8 x i16>* %m, align 2
+ %c = icmp ne <8 x i16> %mask, zeroinitializer
+ %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef)
+ %2 = bitcast i8*
%y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + +define i8* @ldrhf16_m256(i8* %x, i8* %y, <8 x i16> *%m) { +; CHECK-LABEL: ldrhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r2] +; CHECK-NEXT: vpt.i16 ne, q0, zr +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: vstrh.16 q0, [r1] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %x, i32 -256 + %0 = bitcast i8* %z to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %c, <8 x half> undef) + %2 = bitcast i8* %y to <8 x half>* + store <8 x half> %1, <8 x half>* %2, align 2 + ret i8* %z +} + + + + +define i8* @strw32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* 
@strw32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strw32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strw32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x i32>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + %2 = bitcast i8* %z to <4 x i32>* + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: 
vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #254] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m254(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0, #-254] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh32_m256(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strh32_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <4 x i16>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = bitcast i8* %z to <4 x i16>* + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %1, <4 x i16>* %2, i32 2, <4 x i1> %c) + ret i8* %z +} + +define i8* @strh16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: 
vstrht.16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #254] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #-254] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 
-254 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strh16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strh16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x i16>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + %2 = bitcast i8* %z to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %1, <8 x i16>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #3] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #127] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x 
i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m127(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0, #-127] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb32_m128(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strb32_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <4 x i8>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = bitcast i8* %z to <4 x i8>* + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %1, <4 x i8>* %2, i32 1, <4 x i1> %c) + ret i8* %z +} + +define i8* @strb16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #3] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + 
+define i8* @strb16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #127] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m127(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0, #-127] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb16_m128(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strb16_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <8 x i8>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = bitcast i8* %z to <8 x i8>* + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %1, <8 x i8>* %2, i32 1, <8 x i1> %c) + ret i8* %z +} + +define i8* @strb8_4(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: 
vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_3(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #3] +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_2(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #127] +; CHECK-NEXT: adds r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m127(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m127: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0, #-127] +; CHECK-NEXT: subs r0, #127 +; CHECK-NEXT: bx lr +entry: + %z = 
getelementptr inbounds i8, i8* %y, i32 -127 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strb8_m128(i8* %y, i8* %x, <16 x i8> *%m) { +; CHECK-LABEL: strb8_m128: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: subs r0, #128 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vldrb.u8 q1, [r2] +; CHECK-NEXT: vpt.i8 ne, q1, zr +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -128 + %0 = bitcast i8* %x to <16 x i8>* + %mask = load <16 x i8>, <16 x i8>* %m, align 1 + %c = icmp ne <16 x i8> %mask, zeroinitializer + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + %2 = bitcast i8* %z to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %1, <16 x i8>* %2, i32 1, <16 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_4(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_3(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_2(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #508] +; CHECK-NEXT: add.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 
x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m508(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m508: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0, #-508] +; CHECK-NEXT: sub.w r0, r0, #508 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -508 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strwf32_m512(i8* %y, i8* %x, <4 x i32> *%m) { +; CHECK-LABEL: strwf32_m512: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #512 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: vpt.i32 ne, q1, zr +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -512 + %0 = bitcast i8* %x to <4 x float>* + %mask = load <4 x i32>, <4 x i32>* %m, align 4 + %c = icmp ne <4 x i32> %mask, zeroinitializer + %1 = load <4 x float>, <4 x float>* %0, align 4 + %2 = bitcast i8* %z to <4 x float>* + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %1, <4 x float>* %2, i32 4, <4 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_4(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #4] +; CHECK-NEXT: adds r0, #4 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 4 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_3(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adds r0, #3 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 3 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> 
%mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_2(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #2] +; CHECK-NEXT: adds r0, #2 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 2 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #254] +; CHECK-NEXT: adds r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 254 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: add.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_m254(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_m254: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, #-254] +; CHECK-NEXT: subs r0, #254 +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -254 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +define i8* @strhf16_m256(i8* %y, i8* %x, <8 x i16> *%m) { +; CHECK-LABEL: strhf16_m256: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub.w r0, r0, #256 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vpt.i16 ne, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %z = getelementptr inbounds i8, i8* %y, i32 -256 + %0 = bitcast i8* %x to <8 x half>* + %mask = load <8 x i16>, <8 x i16>* %m, align 2 + %c = icmp ne <8 x i16> %mask, zeroinitializer + %1 = load <8 x half>, <8 x half>* %0, align 
2 + %2 = bitcast i8* %z to <8 x half>* + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %1, <8 x half>* %2, i32 2, <8 x i1> %c) + ret i8* %z +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll index 46b64c8..100a082 100644 --- a/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-ldst.ll @@ -13,8 +13,8 @@ define void @foo_v4i32_v4i32(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i32> *%src entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) - call void @llvm.masked.store.v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1) + %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %dest, i32 4, <4 x i1> %1) ret void } @@ -29,9 +29,9 @@ define void @foo_sext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *% entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) + %2 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = sext <4 x i8> %2 to <4 x i32> - call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) ret void } @@ -46,9 +46,9 @@ define void @foo_sext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) + %2 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = sext <4 x i16> %2 to <4 x i32> - call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + call void 
@llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) ret void } @@ -63,9 +63,9 @@ define void @foo_zext_v4i32_v4i8(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i8> *% entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i8> @llvm.masked.load.v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) + %2 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %src, i32 1, <4 x i1> %1, <4 x i8> undef) %3 = zext <4 x i8> %2 to <4 x i32> - call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) ret void } @@ -80,9 +80,9 @@ define void @foo_zext_v4i32_v4i16(<4 x i32> *%dest, <4 x i32> *%mask, <4 x i16> entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i16> @llvm.masked.load.v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) + %2 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %src, i32 2, <4 x i1> %1, <4 x i16> undef) %3 = zext <4 x i16> %2 to <4 x i32> - call void @llvm.masked.store.v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %3, <4 x i32>* %dest, i32 4, <4 x i1> %1) ret void } @@ -234,9 +234,9 @@ define void @foo_sext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> entry: %0 = load <2 x i32>, <2 x i32>* %mask, align 4 %1 = icmp sgt <2 x i32> %0, zeroinitializer - %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) + %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) %3 = sext <2 x i32> %2 to <2 x i64> - call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) + call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) ret void } @@ -392,9 +392,9 @@ define void @foo_sext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, entry: %0 = load <2 x i32>, <2 x i32>* %mask, align 4 %1 = icmp sgt <2 x i32> %0, zeroinitializer - %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef) + %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef) %3 = sext <2 x i32> %2 to <2 x i64> - call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) + call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) ret void } @@ -549,9 +549,9 @@ define void @foo_zext_v2i64_v2i32(<2 x i64> *%dest, <2 x i32> *%mask, <2 x i32> entry: %0 = load <2 x i32>, <2 x i32>* %mask, align 4 %1 = icmp sgt <2 x i32> %0, zeroinitializer - %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) + %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 4, <2 x i1> %1, <2 x i32> undef) %3 = zext <2 x i32> %2 to <2 x i64> - call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) + call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 8, <2 x i1> %1) ret void } @@ -710,9 +710,9 @@ define void @foo_zext_v2i64_v2i32_unaligned(<2 x i64> *%dest, <2 x i32> *%mask, entry: %0 = load <2 x i32>, <2 x i32>* %mask, align 4 %1 = icmp sgt <2 x i32> %0, zeroinitializer - %2 = call <2 x i32> @llvm.masked.load.v2i32(<2 x i32>* %src, i32 2, <2 x i1> 
%1, <2 x i32> undef) + %2 = call <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>* %src, i32 2, <2 x i1> %1, <2 x i32> undef) %3 = zext <2 x i32> %2 to <2 x i64> - call void @llvm.masked.store.v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) + call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %3, <2 x i64>* %dest, i32 4, <2 x i1> %1) ret void } @@ -727,8 +727,8 @@ define void @foo_v8i16_v8i16(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i16> *%src entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer - %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) - call void @llvm.masked.store.v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1) + %2 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %2, <8 x i16>* %dest, i32 2, <8 x i1> %1) ret void } @@ -743,9 +743,9 @@ define void @foo_sext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer - %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) + %2 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = sext <8 x i8> %2 to <8 x i16> - call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) ret void } @@ -760,9 +760,9 @@ define void @foo_zext_v8i16_v8i8(<8 x i16> *%dest, <8 x i16> *%mask, <8 x i8> *% entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer - %2 = call <8 x i8> @llvm.masked.load.v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) + %2 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %src, i32 1, <8 x i1> %1, <8 x i8> undef) %3 = zext <8 x i8> %2 to <8 x i16> - call void @llvm.masked.store.v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %3, <8 x i16>* %dest, i32 2, <8 x i1> %1) ret void } @@ -777,8 +777,8 @@ define void @foo_v16i8_v16i8(<16 x i8> *%dest, <16 x i8> *%mask, <16 x i8> *%src entry: %0 = load <16 x i8>, <16 x i8>* %mask, align 1 %1 = icmp sgt <16 x i8> %0, zeroinitializer - %2 = call <16 x i8> @llvm.masked.load.v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef) - call void @llvm.masked.store.v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1) + %2 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %src, i32 1, <16 x i1> %1, <16 x i8> undef) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %2, <16 x i8>* %dest, i32 1, <16 x i1> %1) ret void } @@ -793,9 +793,9 @@ define void @foo_trunc_v8i8_v8i16(<8 x i8> *%dest, <8 x i16> *%mask, <8 x i16> * entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer - %2 = call <8 x i16> @llvm.masked.load.v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) + %2 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %src, i32 2, <8 x i1> %1, <8 x i16> undef) %3 = trunc <8 x i16> %2 to <8 x i8> - call void @llvm.masked.store.v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1) + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %3, <8 x i8>* %dest, i32 1, <8 x i1> %1) ret void } @@ -810,9 +810,9 @@ define void @foo_trunc_v4i8_v4i32(<4 x i8> *%dest, <4 x i32> 
*%mask, <4 x i32> * entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = trunc <4 x i32> %2 to <4 x i8> - call void @llvm.masked.store.v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1) + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %3, <4 x i8>* %dest, i32 1, <4 x i1> %1) ret void } @@ -827,9 +827,9 @@ define void @foo_trunc_v4i16_v4i32(<4 x i16> *%dest, <4 x i32> *%mask, <4 x i32> entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x i32> @llvm.masked.load.v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) + %2 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %src, i32 4, <4 x i1> %1, <4 x i32> undef) %3 = trunc <4 x i32> %2 to <4 x i16> - call void @llvm.masked.store.v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1) + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %3, <4 x i16>* %dest, i32 2, <4 x i1> %1) ret void } @@ -844,8 +844,8 @@ define void @foo_v4f32_v4f32(<4 x float> *%dest, <4 x i32> *%mask, <4 x float> * entry: %0 = load <4 x i32>, <4 x i32>* %mask, align 4 %1 = icmp sgt <4 x i32> %0, zeroinitializer - %2 = call <4 x float> @llvm.masked.load.v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef) - call void @llvm.masked.store.v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1) + %2 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %src, i32 4, <4 x i1> %1, <4 x float> undef) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %2, <4 x float>* %dest, i32 4, <4 x i1> %1) ret void } @@ -860,8 +860,8 @@ define void @foo_v8f16_v8f16(<8 x half> *%dest, <8 x i16> *%mask, <8 x half> *%s entry: %0 = load <8 x i16>, <8 x i16>* %mask, align 2 %1 = icmp sgt <8 x i16> %0, zeroinitializer - %2 = call <8 x half> @llvm.masked.load.v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef) - call void @llvm.masked.store.v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1) + %2 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %src, i32 2, <8 x i1> %1, <8 x half> undef) + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %2, <8 x half>* %dest, i32 2, <8 x i1> %1) ret void } @@ -991,9 +991,9 @@ define void @foo_v4f32_v4f16(<4 x float> *%dest, <4 x i16> *%mask, <4 x half> *% entry: %0 = load <4 x i16>, <4 x i16>* %mask, align 2 %1 = icmp sgt <4 x i16> %0, zeroinitializer - %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) + %2 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) %3 = fpext <4 x half> %2 to <4 x float> - call void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %dest, i32 2, <4 x i1> %1) ret void } @@ -1123,29 +1123,29 @@ define void @foo_v4f32_v4f16_unaligned(<4 x float> *%dest, <4 x i16> *%mask, <4 entry: %0 = load <4 x i16>, <4 x i16>* %mask, align 2 %1 = icmp sgt <4 x i16> %0, zeroinitializer - %2 = call <4 x half> @llvm.masked.load.v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) + %2 = call <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>* %src, i32 2, <4 x i1> %1, <4 x half> undef) %3 = fpext <4 x half> %2 to <4 x float> - call 
void @llvm.masked.store.v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %3, <4 x float>* %dest, i32 1, <4 x i1> %1) ret void } -declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) -declare void @llvm.masked.store.v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) -declare void @llvm.masked.store.v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) -declare void @llvm.masked.store.v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) -declare <16 x i8> @llvm.masked.load.v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) -declare <8 x i16> @llvm.masked.load.v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) -declare <2 x i32> @llvm.masked.load.v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) -declare <4 x i32> @llvm.masked.load.v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) -declare <4 x float> @llvm.masked.load.v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) -declare <4 x half> @llvm.masked.load.v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>) -declare <8 x half> @llvm.masked.load.v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) +declare <2 x i32> @llvm.masked.load.v2i32.p0v2i32(<2 x i32>*, i32, <2 x i1>, <2 x i32>) +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32, <4 x i1>, <4 x float>) +declare <4 x half> @llvm.masked.load.v4f16.p0v4f16(<4 x half>*, i32, <4 x i1>, <4 x half>) +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32, <8 x i1>, <8 x half>) -declare void @llvm.masked.store.v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) -declare void @llvm.masked.store.v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) -declare void @llvm.masked.store.v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) -declare void @llvm.masked.store.v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) -declare <4 x i16> @llvm.masked.load.v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) -declare <4 x i8> @llvm.masked.load.v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) -declare <8 x i8> @llvm.masked.load.v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32, <8 x i1>) +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32, <4 x i1>) +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32, <4 x i1>) +declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32, <2 x i1>) +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>) +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32, <8 x i1>, <8 x i8>) -- 2.7.4