From 394974111b1781a2f93e502082cecad1b3aff6ee Mon Sep 17 00:00:00 2001
From: Anna Welker
Date: Mon, 2 Mar 2020 09:14:37 +0000
Subject: [PATCH] [ARM][MVE] Restrict allowed types of gather/scatter offsets

The MVE gather instructions with offset sizes smaller than 32 bits
zero-extend the values in the offset register, as opposed to
sign-extending them. We need to make sure that the code we select from
is suitably extended, which this patch addresses by tightening up the
offset checks.

Differential Revision: https://reviews.llvm.org/D75361
---
 llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp   |  33 ++++--
 .../CodeGen/Thumb2/mve-gather-ind16-unscaled.ll    | 131 +++++++++++++++++++++
 2 files changed, 152 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 84f9dc0..b90440b 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -145,26 +145,35 @@ Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
     return nullptr;
   }
   Offsets = GEP->getOperand(1);
-  // SExt offsets inside masked gathers are not permitted by the architecture;
-  // we therefore can't fold them
+  // Paranoid check whether the number of parallel lanes is the same
+  assert(Ty->getVectorNumElements() ==
+         Offsets->getType()->getVectorNumElements());
+  // Only <N x i32> offsets can be integrated into an arm gather, any smaller
+  // type would have to be sign extended by the gep - and arm gathers can only
+  // zero extend. Additionally, the offsets do have to originate from a zext of
+  // a vector with element types smaller or equal the type of the gather we're
+  // looking at
+  if (Offsets->getType()->getScalarSizeInBits() != 32)
+    return nullptr;
   if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
     Offsets = ZextOffs->getOperand(0);
-  Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
-  // If the offset we found does not have the type the intrinsic expects,
-  // i.e., the same type as the gather (or scatter input) itself, we need to
-  // convert it (only i types) or fall back to expanding the gather
-  if (OffsType != Offsets->getType()) {
-    if (OffsType->getScalarSizeInBits() >
-        Offsets->getType()->getScalarSizeInBits()) {
-      LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
-      Offsets = Builder.CreateZExt(Offsets, OffsType, "");
-    } else {
+  else if (!(Offsets->getType()->getVectorNumElements() == 4 &&
+             Offsets->getType()->getScalarSizeInBits() == 32))
+    return nullptr;
+
+  if (Ty != Offsets->getType()) {
+    if ((Ty->getScalarSizeInBits() <
+         Offsets->getType()->getScalarSizeInBits())) {
       LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
                        << " Can't create intrinsic.\n");
       return nullptr;
+    } else {
+      Offsets = Builder.CreateZExt(
+          Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
     }
   }
   // If none of the checks failed, return the gep's base pointer
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
   return GEPPtr;
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
index 7b90857..ee26676 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
@@ -16,6 +16,137 @@ entry:
   ret <8 x i16> %gather.zext
 }
 
+define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: zext_unscaled_i8_i16_noext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.s32 q0, [r1]
+; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, s3
+; CHECK-NEXT:    vmov r3, s1
+; CHECK-NEXT:    vmov r5, s0
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.16 q0[0], r5
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    vmov.16 q0[1], lr
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[2], r2
+; CHECK-NEXT:    vmov.16 q0[3], r12
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs
+  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+  %gather.zext = zext <8 x i8> %gather to <8 x i16>
+  ret <8 x i16> %gather.zext
+}
+
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_sext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.s32 q0, [r1]
+; CHECK-NEXT:    vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrh.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrh.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r12
+; CHECK-NEXT:    vmov.16 q0[3], lr
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+  %offs.sext = sext <8 x i8> %offs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.sext
+  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+  ret <8 x i16> %gather
+}
+
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(i16* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_zext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r1]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrh.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrh.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r12
+; CHECK-NEXT:    vmov.16 q0[3], lr
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+  %offs.zext = zext <8 x i8> %offs to <8 x i16>
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.zext
+  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+  ret <8 x i16> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-- 
2.7.4
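
For readers who want the tightened legality rule in isolation, below is a
minimal, self-contained C++ sketch of the offset check that the patch adds to
MVEGatherScatterLowering::checkGEP. It is a toy model for illustration only,
not the LLVM API: the OffsetVec struct, canFoldOffsets, and the example widths
are invented here, and the real logic (including the subsequent zero-extension
of accepted offsets) lives in the hunk above.

// Toy model of the tightened offset check (illustration only; all names
// here are invented and this is not the LLVM API).
#include <cstdio>

struct OffsetVec {
  unsigned NumLanes;   // number of vector lanes in the GEP offset operand
  unsigned EltBits;    // element width of the GEP offset operand, in bits
  bool FromZExt;       // true if that operand is a zext of a narrower vector
  unsigned SrcEltBits; // pre-zext element width (only meaningful if FromZExt)
};

// GatherEltBits is the element width of the gather/scatter being lowered.
static bool canFoldOffsets(unsigned GatherEltBits, const OffsetVec &Offs) {
  // The GEP offset operand itself must already be <N x i32>: anything
  // narrower would be sign extended by the GEP, but the MVE gather
  // instructions only zero extend their offset register.
  if (Offs.EltBits != 32)
    return false;
  // Without a zext to look through, only a natively 32-bit offset vector
  // (4 lanes of i32) is acceptable.
  if (!Offs.FromZExt && Offs.NumLanes != 4)
    return false;
  // The (possibly pre-zext) offsets must not be wider than the gather
  // elements, since the hardware only extends them upwards.
  unsigned EffectiveBits = Offs.FromZExt ? Offs.SrcEltBits : Offs.EltBits;
  return EffectiveBits <= GatherEltBits;
}

int main() {
  // zext <8 x i8> -> <8 x i32> offsets feeding an i16 gather: foldable.
  std::printf("%d\n", canFoldOffsets(16, {8, 32, true, 8}));
  // <8 x i8> offsets used directly (the GEP would sign extend): rejected.
  std::printf("%d\n", canFoldOffsets(16, {8, 8, false, 0}));
  return 0;
}

In the pass itself, offsets that pass these checks and are still narrower than
the gather type are then zero-extended with Builder.CreateZExt, as shown in the
patch; everything else falls back to the expanded (non-gather) code seen in the
new test cases.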