return nullptr;
}
Offsets = GEP->getOperand(1);
- // SExt offsets inside masked gathers are not permitted by the architecture;
- // we therefore can't fold them
+ // Sanity check that the gather and its offsets operate on the same number
+ // of parallel lanes
+ assert(Ty->getVectorNumElements() ==
+ Offsets->getType()->getVectorNumElements());
+ // Only <N x i32> offsets can be integrated into an Arm gather; any smaller
+ // type would have to be sign extended by the GEP, and Arm gathers can only
+ // zero extend. Additionally, the offsets have to originate from a zext of
+ // a vector with element types smaller than or equal to the element type of
+ // the gather we're looking at.
+ if (Offsets->getType()->getScalarSizeInBits() != 32)
+ return nullptr;
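+ // For example, a GEP that takes <8 x i8> offsets directly, as in
+ //   getelementptr inbounds i8, i8* %base, <8 x i8> %offs
+ // implicitly sign extends %offs and is rejected here.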
if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
Offsets = ZextOffs->getOperand(0);
- Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
- // If the offset we found does not have the type the intrinsic expects,
- // i.e., the same type as the gather (or scatter input) itself, we need to
- // convert it (only i types) or fall back to expanding the gather
- if (OffsType != Offsets->getType()) {
- if (OffsType->getScalarSizeInBits() >
- Offsets->getType()->getScalarSizeInBits()) {
- LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
- Offsets = Builder.CreateZExt(Offsets, OffsType, "");
- } else {
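+ // If the offsets aren't zero extended, they can only be used as-is if
+ // they already form a <4 x i32> vector, which needs no extension.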
+ else if (!(Offsets->getType()->getVectorNumElements() == 4 &&
+ Offsets->getType()->getScalarSizeInBits() == 32))
+ return nullptr;
+
+ if (Ty != Offsets->getType()) {
+ if (Ty->getScalarSizeInBits() <
+ Offsets->getType()->getScalarSizeInBits()) {
LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
<< " Can't create intrinsic.\n");
return nullptr;
+ } else {
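+ // Widen the offsets with a zext, mirroring the zero extension that was
+ // looked through above.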
+ Offsets = Builder.CreateZExt(
+ Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
}
}
// If none of the checks failed, return the gep's base pointer
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
return GEPPtr;
}
ret <8 x i16> %gather.zext
}
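+
+; The GEP below uses the <8 x i8> offsets directly; the GEP implicitly sign
+; extends them, which an MVE gather can't do, so this load is expected to be
+; scalarized.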
+define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16_noext(i8* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: zext_unscaled_i8_i16_noext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vldrb.s32 q0, [r1]
+; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r2, s3
+; CHECK-NEXT: vmov r3, s1
+; CHECK-NEXT: vmov r5, s0
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov r4, s7
+; CHECK-NEXT: ldrb.w r12, [r2]
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: ldrb.w lr, [r3]
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: vmov.16 q0[0], r5
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: vmov.16 q0[1], lr
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov.16 q0[2], r2
+; CHECK-NEXT: vmov.16 q0[3], r12
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.16 q0[5], r1
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r4
+; CHECK-NEXT: vmovlb.u8 q0, q0
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+ %ptrs = getelementptr inbounds i8, i8* %base, <8 x i8> %offs
+ %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+ %gather.zext = zext <8 x i8> %gather to <8 x i16>
+ ret <8 x i16> %gather.zext
+}
+
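+; The i16 GEP below takes <8 x i16> offsets; only <N x i32> offsets can be
+; folded into an MVE gather, so this is expected to be scalarized as well.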
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_sext(i16* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_sext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vldrb.s32 q0, [r1]
+; CHECK-NEXT: vldrb.s32 q1, [r1, #4]
+; CHECK-NEXT: vshl.i32 q0, q0, #1
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s3
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov r4, s7
+; CHECK-NEXT: ldrh.w r12, [r2]
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: ldrh.w lr, [r3]
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r3, [r3]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: vmov.16 q0[1], r5
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.16 q0[5], r1
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r4
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+ %offs.sext = sext <8 x i8> %offs to <8 x i16>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.sext
+ %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+ ret <8 x i16> %gather
+}
+
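+; Same as above, but with zero extended offsets; the GEP still takes
+; <8 x i16> offsets, so still no gather.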
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_zext(i16* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_zext:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r7, lr}
+; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: vldrb.u32 q0, [r1]
+; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT: vshl.i32 q0, q0, #1
+; CHECK-NEXT: vshl.i32 q1, q1, #1
+; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vadd.i32 q1, q1, r0
+; CHECK-NEXT: vmov r2, s2
+; CHECK-NEXT: vmov r3, s3
+; CHECK-NEXT: vmov r5, s1
+; CHECK-NEXT: vmov r0, s4
+; CHECK-NEXT: vmov r1, s5
+; CHECK-NEXT: vmov r4, s7
+; CHECK-NEXT: ldrh.w r12, [r2]
+; CHECK-NEXT: vmov r2, s0
+; CHECK-NEXT: ldrh.w lr, [r3]
+; CHECK-NEXT: vmov r3, s6
+; CHECK-NEXT: ldrh r5, [r5]
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh r4, [r4]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r3, [r3]
+; CHECK-NEXT: vmov.16 q0[0], r2
+; CHECK-NEXT: vmov.16 q0[1], r5
+; CHECK-NEXT: vmov.16 q0[2], r12
+; CHECK-NEXT: vmov.16 q0[3], lr
+; CHECK-NEXT: vmov.16 q0[4], r0
+; CHECK-NEXT: vmov.16 q0[5], r1
+; CHECK-NEXT: vmov.16 q0[6], r3
+; CHECK-NEXT: vmov.16 q0[7], r4
+; CHECK-NEXT: pop {r4, r5, r7, pc}
+entry:
+ %offs = load <8 x i8>, <8 x i8>* %offptr, align 2
+ %offs.zext = zext <8 x i8> %offs to <8 x i16>
+ %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs.zext
+ %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+ ret <8 x i16> %gather
+}
+
define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
; CHECK-LABEL: sext_unscaled_i8_i16:
; CHECK: @ %bb.0: @ %entry