Allow scalar_to_vector nodes to be used as the start of a build_vector creation
return UpdateBuildVector(Ops);
}
+ if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR && InVec.hasOneUse()) {
+ Ops.push_back(InVec.getOperand(0));
+ Ops.append(NumElts - 1, DAG.getUNDEF(InVec.getOperand(0).getValueType()));
+ return UpdateBuildVector(Ops);
+ }
+
if (InVec.isUndef()) {
Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
return UpdateBuildVector(Ops);
;
; AIX-P8-32-LABEL: test_f2:
; AIX-P8-32: # %bb.0:
-; AIX-P8-32-NEXT: lwz r6, L..C0(r2) # %const.0
-; AIX-P8-32-NEXT: li r7, 4
+; AIX-P8-32-NEXT: li r6, 4
; AIX-P8-32-NEXT: lxsiwzx v3, 0, r3
-; AIX-P8-32-NEXT: lxsiwzx v0, 0, r4
-; AIX-P8-32-NEXT: lxsiwzx v2, r3, r7
-; AIX-P8-32-NEXT: lxsiwzx v5, r4, r7
-; AIX-P8-32-NEXT: lxvw4x v4, 0, r6
-; AIX-P8-32-NEXT: vperm v2, v3, v2, v4
-; AIX-P8-32-NEXT: vperm v3, v0, v5, v4
+; AIX-P8-32-NEXT: lxsiwzx v5, 0, r4
+; AIX-P8-32-NEXT: lxsiwzx v2, r3, r6
+; AIX-P8-32-NEXT: lxsiwzx v4, r4, r6
+; AIX-P8-32-NEXT: vmrgow v2, v3, v2
+; AIX-P8-32-NEXT: vmrgow v3, v5, v4
; AIX-P8-32-NEXT: xvaddsp vs0, v2, v3
; AIX-P8-32-NEXT: xxsldwi vs1, vs0, vs0, 1
; AIX-P8-32-NEXT: xscvspdpn f0, vs0
;
; AIX-P9-32-LABEL: test_f2:
; AIX-P9-32: # %bb.0:
-; AIX-P9-32-NEXT: lfiwzx f0, 0, r3
-; AIX-P9-32-NEXT: lwz r3, 4(r3)
-; AIX-P9-32-NEXT: xxsldwi vs0, f0, f0, 1
-; AIX-P9-32-NEXT: mtfprwz f1, r3
-; AIX-P9-32-NEXT: lwz r3, 4(r4)
-; AIX-P9-32-NEXT: xxinsertw vs0, vs1, 4
-; AIX-P9-32-NEXT: lfiwzx f1, 0, r4
-; AIX-P9-32-NEXT: mtfprwz f2, r3
-; AIX-P9-32-NEXT: xxsldwi vs1, f1, f1, 1
-; AIX-P9-32-NEXT: xxinsertw vs1, vs2, 4
-; AIX-P9-32-NEXT: xvaddsp vs0, vs0, vs1
+; AIX-P9-32-NEXT: li r6, 4
+; AIX-P9-32-NEXT: lxsiwzx v3, 0, r3
+; AIX-P9-32-NEXT: lxsiwzx v4, 0, r4
+; AIX-P9-32-NEXT: lxsiwzx v2, r3, r6
+; AIX-P9-32-NEXT: vmrgow v2, v3, v2
+; AIX-P9-32-NEXT: lxsiwzx v3, r4, r6
+; AIX-P9-32-NEXT: vmrgow v3, v4, v3
+; AIX-P9-32-NEXT: xvaddsp vs0, v2, v3
; AIX-P9-32-NEXT: xscvspdpn f1, vs0
; AIX-P9-32-NEXT: xxsldwi vs0, vs0, vs0, 1
; AIX-P9-32-NEXT: xscvspdpn f0, vs0
;
; P8-AIX-32-LABEL: testmrglb3:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r5, 4(r3)
-; P8-AIX-32-NEXT: lwz r4, L..C0(r2) # %const.0
-; P8-AIX-32-NEXT: stw r5, -32(r1)
-; P8-AIX-32-NEXT: lwz r3, 0(r3)
-; P8-AIX-32-NEXT: lxvw4x v2, 0, r4
-; P8-AIX-32-NEXT: addi r4, r1, -16
-; P8-AIX-32-NEXT: stw r3, -16(r1)
-; P8-AIX-32-NEXT: addi r3, r1, -32
-; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
-; P8-AIX-32-NEXT: lxvw4x v4, 0, r4
-; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
+; P8-AIX-32-NEXT: lwz r4, 4(r3)
; P8-AIX-32-NEXT: xxlxor v3, v3, v3
+; P8-AIX-32-NEXT: stw r4, -16(r1)
+; P8-AIX-32-NEXT: addi r4, r1, -32
+; P8-AIX-32-NEXT: lwz r3, 0(r3)
+; P8-AIX-32-NEXT: stw r3, -32(r1)
+; P8-AIX-32-NEXT: addi r3, r1, -16
+; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
+; P8-AIX-32-NEXT: lxvw4x vs1, 0, r4
+; P8-AIX-32-NEXT: xxmrghw v2, vs1, vs0
; P8-AIX-32-NEXT: vmrghb v2, v3, v2
; P8-AIX-32-NEXT: blr
entry:
;
; P8-AIX-32-LABEL: no_crash_bitcast:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r4, L..C1(r2) # %const.0
+; P8-AIX-32-NEXT: lwz r4, L..C0(r2) # %const.0
; P8-AIX-32-NEXT: stw r3, -16(r1)
; P8-AIX-32-NEXT: addi r3, r1, -16
; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
;
; P8-AIX-32-LABEL: replace_undefs_in_splat:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r3, L..C2(r2) # %const.0
-; P8-AIX-32-NEXT: lwz r4, L..C3(r2) # %const.1
+; P8-AIX-32-NEXT: lwz r3, L..C1(r2) # %const.0
+; P8-AIX-32-NEXT: lwz r4, L..C2(r2) # %const.1
; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
; P8-AIX-32-NEXT: lxvw4x v4, 0, r4
; P8-AIX-32-NEXT: vperm v2, v2, v4, v3
;
; P8-AIX-32-LABEL: testSplat8:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r5, 4(r3)
-; P8-AIX-32-NEXT: lwz r4, L..C4(r2) # %const.0
-; P8-AIX-32-NEXT: stw r5, -32(r1)
+; P8-AIX-32-NEXT: lwz r4, 4(r3)
+; P8-AIX-32-NEXT: stw r4, -16(r1)
+; P8-AIX-32-NEXT: addi r4, r1, -32
; P8-AIX-32-NEXT: lwz r3, 0(r3)
-; P8-AIX-32-NEXT: lxvw4x v2, 0, r4
-; P8-AIX-32-NEXT: addi r4, r1, -16
-; P8-AIX-32-NEXT: stw r3, -16(r1)
-; P8-AIX-32-NEXT: addi r3, r1, -32
-; P8-AIX-32-NEXT: lxvw4x v3, 0, r3
-; P8-AIX-32-NEXT: lxvw4x v4, 0, r4
-; P8-AIX-32-NEXT: vperm v2, v4, v3, v2
-; P8-AIX-32-NEXT: xxmrghd v2, v2, v2
+; P8-AIX-32-NEXT: stw r3, -32(r1)
+; P8-AIX-32-NEXT: addi r3, r1, -16
+; P8-AIX-32-NEXT: lxvw4x vs0, 0, r3
+; P8-AIX-32-NEXT: lxvw4x vs1, 0, r4
+; P8-AIX-32-NEXT: xxmrghw vs0, vs1, vs0
+; P8-AIX-32-NEXT: xxmrghd v2, vs0, vs0
; P8-AIX-32-NEXT: blr
entry:
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
;
; P8-AIX-32-LABEL: testSplati64_0:
; P8-AIX-32: # %bb.0: # %entry
-; P8-AIX-32-NEXT: lwz r4, L..C5(r2) # %const.0
+; P8-AIX-32-NEXT: lwz r4, L..C3(r2) # %const.0
; P8-AIX-32-NEXT: lwz r5, 4(r3)
; P8-AIX-32-NEXT: lwz r3, 0(r3)
; P8-AIX-32-NEXT: stw r5, -16(r1)
;
; P9-AIX32-LABEL: unadjusted_lxvdsx:
; P9-AIX32: # %bb.0: # %entry
-; P9-AIX32-NEXT: lwz r4, 0(r3)
+; P9-AIX32-NEXT: lwz r4, 4(r3)
; P9-AIX32-NEXT: stw r4, -16(r1)
-; P9-AIX32-NEXT: lwz r3, 4(r3)
-; P9-AIX32-NEXT: lxv vs1, -16(r1)
-; P9-AIX32-NEXT: mtfprwz f0, r3
-; P9-AIX32-NEXT: xxinsertw vs1, vs0, 4
-; P9-AIX32-NEXT: xxmrghd v2, vs1, vs1
+; P9-AIX32-NEXT: lwz r3, 0(r3)
+; P9-AIX32-NEXT: lxv vs0, -16(r1)
+; P9-AIX32-NEXT: stw r3, -32(r1)
+; P9-AIX32-NEXT: lxv vs1, -32(r1)
+; P9-AIX32-NEXT: xxmrghw vs0, vs1, vs0
+; P9-AIX32-NEXT: xxmrghd v2, vs0, vs0
; P9-AIX32-NEXT: blr
;
; P8-AIX32-LABEL: unadjusted_lxvdsx:
; P8-AIX32: # %bb.0: # %entry
-; P8-AIX32-NEXT: lwz r5, 4(r3)
-; P8-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0
-; P8-AIX32-NEXT: stw r5, -32(r1)
+; P8-AIX32-NEXT: lwz r4, 4(r3)
+; P8-AIX32-NEXT: stw r4, -16(r1)
+; P8-AIX32-NEXT: addi r4, r1, -32
; P8-AIX32-NEXT: lwz r3, 0(r3)
-; P8-AIX32-NEXT: lxvw4x v2, 0, r4
-; P8-AIX32-NEXT: addi r4, r1, -16
-; P8-AIX32-NEXT: stw r3, -16(r1)
-; P8-AIX32-NEXT: addi r3, r1, -32
-; P8-AIX32-NEXT: lxvw4x v3, 0, r3
-; P8-AIX32-NEXT: lxvw4x v4, 0, r4
-; P8-AIX32-NEXT: vperm v2, v4, v3, v2
-; P8-AIX32-NEXT: xxmrghd v2, v2, v2
+; P8-AIX32-NEXT: stw r3, -32(r1)
+; P8-AIX32-NEXT: addi r3, r1, -16
+; P8-AIX32-NEXT: lxvw4x vs0, 0, r3
+; P8-AIX32-NEXT: lxvw4x vs1, 0, r4
+; P8-AIX32-NEXT: xxmrghw vs0, vs1, vs0
+; P8-AIX32-NEXT: xxmrghd v2, vs0, vs0
; P8-AIX32-NEXT: blr
;
; P7-AIX32-LABEL: unadjusted_lxvdsx:
; P7-AIX32: # %bb.0: # %entry
; P7-AIX32-NEXT: lwz r5, 4(r3)
-; P7-AIX32-NEXT: lwz r4, L..C3(r2) # %const.0
-; P7-AIX32-NEXT: stw r5, -32(r1)
-; P7-AIX32-NEXT: lwz r3, 0(r3)
-; P7-AIX32-NEXT: lxvw4x v2, 0, r4
; P7-AIX32-NEXT: addi r4, r1, -16
-; P7-AIX32-NEXT: stw r3, -16(r1)
+; P7-AIX32-NEXT: stw r5, -16(r1)
+; P7-AIX32-NEXT: lwz r3, 0(r3)
+; P7-AIX32-NEXT: stw r3, -32(r1)
; P7-AIX32-NEXT: addi r3, r1, -32
-; P7-AIX32-NEXT: lxvw4x v3, 0, r3
-; P7-AIX32-NEXT: lxvw4x v4, 0, r4
-; P7-AIX32-NEXT: vperm v2, v4, v3, v2
-; P7-AIX32-NEXT: xxmrghd v2, v2, v2
+; P7-AIX32-NEXT: lxvw4x vs0, 0, r4
+; P7-AIX32-NEXT: lxvw4x vs1, 0, r3
+; P7-AIX32-NEXT: xxmrghw vs0, vs1, vs0
+; P7-AIX32-NEXT: xxmrghd v2, vs0, vs0
; P7-AIX32-NEXT: blr
entry:
%0 = bitcast i64* %s to <8 x i8>*
; P9BE-AIX32-LABEL: test64:
; P9BE-AIX32: # %bb.0: # %entry
; P9BE-AIX32-NEXT: lwzux 4, 3, 4
-; P9BE-AIX32-NEXT: lwz 5, L..C0(2) # %const.0
; P9BE-AIX32-NEXT: xxlxor 4, 4, 4
-; P9BE-AIX32-NEXT: lxv 3, 0(5)
-; P9BE-AIX32-NEXT: stw 4, -32(1)
+; P9BE-AIX32-NEXT: stw 4, -48(1)
; P9BE-AIX32-NEXT: lwz 4, 4(3)
-; P9BE-AIX32-NEXT: lxv 2, -32(1)
-; P9BE-AIX32-NEXT: stw 4, -16(1)
-; P9BE-AIX32-NEXT: mtfprwz 0, 4
+; P9BE-AIX32-NEXT: lxv 0, -48(1)
+; P9BE-AIX32-NEXT: stw 4, -32(1)
+; P9BE-AIX32-NEXT: lwz 4, L..C0(2) # %const.0
; P9BE-AIX32-NEXT: lwz 3, 8(3)
-; P9BE-AIX32-NEXT: xxinsertw 2, 0, 4
-; P9BE-AIX32-NEXT: mtfprwz 0, 3
+; P9BE-AIX32-NEXT: lxv 1, -32(1)
+; P9BE-AIX32-NEXT: lxv 3, 0(4)
+; P9BE-AIX32-NEXT: stw 3, -16(1)
; P9BE-AIX32-NEXT: lwz 3, L..C1(2) # %const.1
+; P9BE-AIX32-NEXT: xxmrghw 2, 0, 1
+; P9BE-AIX32-NEXT: lxv 0, -16(1)
; P9BE-AIX32-NEXT: vperm 2, 4, 2, 3
-; P9BE-AIX32-NEXT: lxv 3, -16(1)
; P9BE-AIX32-NEXT: lxv 4, 0(3)
-; P9BE-AIX32-NEXT: xxinsertw 3, 0, 4
+; P9BE-AIX32-NEXT: xxmrghw 3, 1, 0
; P9BE-AIX32-NEXT: vperm 3, 3, 3, 4
; P9BE-AIX32-NEXT: vspltisw 4, 8
; P9BE-AIX32-NEXT: vnegw 3, 3
;
; AIX-32-LABEL: test2:
; AIX-32: # %bb.0: # %entry
-; AIX-32-NEXT: lwz r5, L..C0(r2) # %const.0
-; AIX-32-NEXT: li r6, 4
+; AIX-32-NEXT: li r5, 4
; AIX-32-NEXT: lxsiwzx v3, 0, r3
-; AIX-32-NEXT: lxsiwzx v0, 0, r4
-; AIX-32-NEXT: lxsiwzx v2, r3, r6
-; AIX-32-NEXT: lxsiwzx v5, r4, r6
-; AIX-32-NEXT: lxvw4x v4, 0, r5
-; AIX-32-NEXT: vperm v2, v3, v2, v4
-; AIX-32-NEXT: vperm v3, v0, v5, v4
+; AIX-32-NEXT: lxsiwzx v5, 0, r4
+; AIX-32-NEXT: lxsiwzx v2, r3, r5
+; AIX-32-NEXT: lxsiwzx v4, r4, r5
+; AIX-32-NEXT: vmrgow v2, v3, v2
+; AIX-32-NEXT: vmrgow v3, v5, v4
; AIX-32-NEXT: xvsubsp vs0, v2, v3
; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1
; AIX-32-NEXT: xscvspdpn f0, vs0
;
; AIX-32-LABEL: test3:
; AIX-32: # %bb.0: # %entry
-; AIX-32-NEXT: lwz r5, L..C1(r2) # %const.0
-; AIX-32-NEXT: li r6, 4
+; AIX-32-NEXT: li r5, 4
; AIX-32-NEXT: lxsiwzx v3, 0, r3
-; AIX-32-NEXT: lxsiwzx v0, 0, r4
-; AIX-32-NEXT: lxsiwzx v2, r3, r6
-; AIX-32-NEXT: lxsiwzx v5, r4, r6
-; AIX-32-NEXT: lxvw4x v4, 0, r5
-; AIX-32-NEXT: vperm v2, v3, v2, v4
-; AIX-32-NEXT: vperm v3, v0, v5, v4
+; AIX-32-NEXT: lxsiwzx v5, 0, r4
+; AIX-32-NEXT: lxsiwzx v2, r3, r5
+; AIX-32-NEXT: lxsiwzx v4, r4, r5
+; AIX-32-NEXT: vmrgow v2, v3, v2
+; AIX-32-NEXT: vmrgow v3, v5, v4
; AIX-32-NEXT: xvaddsp vs0, v2, v3
; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1
; AIX-32-NEXT: xscvspdpn f0, vs0
;
; AIX-32-LABEL: test4:
; AIX-32: # %bb.0: # %entry
-; AIX-32-NEXT: lwz r5, L..C2(r2) # %const.0
-; AIX-32-NEXT: li r6, 4
+; AIX-32-NEXT: li r5, 4
; AIX-32-NEXT: lxsiwzx v3, 0, r3
-; AIX-32-NEXT: lxsiwzx v0, 0, r4
-; AIX-32-NEXT: lxsiwzx v2, r3, r6
-; AIX-32-NEXT: lxsiwzx v5, r4, r6
-; AIX-32-NEXT: lxvw4x v4, 0, r5
-; AIX-32-NEXT: vperm v2, v3, v2, v4
-; AIX-32-NEXT: vperm v3, v0, v5, v4
+; AIX-32-NEXT: lxsiwzx v5, 0, r4
+; AIX-32-NEXT: lxsiwzx v2, r3, r5
+; AIX-32-NEXT: lxsiwzx v4, r4, r5
+; AIX-32-NEXT: vmrgow v2, v3, v2
+; AIX-32-NEXT: vmrgow v3, v5, v4
; AIX-32-NEXT: xvmulsp vs0, v2, v3
; AIX-32-NEXT: xxsldwi vs1, vs0, vs0, 1
; AIX-32-NEXT: xscvspdpn f0, vs0
;
; AIX-32-LABEL: test5:
; AIX-32: # %bb.0: # %entry
-; AIX-32-NEXT: lwz r3, L..C3(r2) # @G
+; AIX-32-NEXT: lwz r3, L..C0(r2) # @G
; AIX-32-NEXT: lfs f0, 4(r3)
; AIX-32-NEXT: lfs f1, 0(r3)
; AIX-32-NEXT: xxmrghd vs0, vs1, vs0
;
; AIX-32-LABEL: test6:
; AIX-32: # %bb.0: # %bb
-; AIX-32-NEXT: lwz r3, L..C4(r2) # @Glob1
+; AIX-32-NEXT: lwz r3, L..C1(r2) # @Glob1
; AIX-32-NEXT: lis r4, 8
; AIX-32-NEXT: ori r4, r4, 38248
; AIX-32-NEXT: lfsux f0, r3, r4
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrd r2, r3, [r0]
; CHECK-NEXT: ldr r0, [r0, #8]
-; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: vmov.32 q0[2], r0
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vins.f16 s8, s2
define void @vst3_v4f16(<4 x half> *%src, <12 x half> *%dst) {
; CHECK-LABEL: vst3_v4f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: ldrd r2, r12, [r0]
-; CHECK-NEXT: ldrd r3, lr, [r0, #8]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrd r2, r0, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r3
-; CHECK-NEXT: vmov.32 q0[1], r12
-; CHECK-NEXT: vmov.32 q1[1], lr
-; CHECK-NEXT: vmov.f32 s8, s1
-; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vins.f16 s8, s5
-; CHECK-NEXT: vmov.f32 s2, s4
-; CHECK-NEXT: vmov.32 q1[0], r2
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: ldrd lr, r12, [r0]
+; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldrd r4, r0, [r0, #16]
+; CHECK-NEXT: vmov q0[2], q0[0], lr, r3
+; CHECK-NEXT: vmov.32 q1[0], r4
+; CHECK-NEXT: vmov q0[3], q0[1], r12, r2
; CHECK-NEXT: vmov.32 q1[1], r0
-; CHECK-NEXT: vmovx.f16 s13, s3
+; CHECK-NEXT: vmovx.f16 s9, s3
; CHECK-NEXT: vmovx.f16 s6, s0
; CHECK-NEXT: vins.f16 s0, s2
-; CHECK-NEXT: vmovx.f16 s10, s4
+; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vins.f16 s4, s6
; CHECK-NEXT: vmovx.f16 s6, s1
-; CHECK-NEXT: vins.f16 s2, s10
-; CHECK-NEXT: vmovx.f16 s10, s5
+; CHECK-NEXT: vins.f16 s2, s8
+; CHECK-NEXT: vmovx.f16 s8, s5
; CHECK-NEXT: vins.f16 s5, s6
-; CHECK-NEXT: vins.f16 s13, s10
-; CHECK-NEXT: vmov.f32 s12, s5
-; CHECK-NEXT: vmov.f32 s1, s4
-; CHECK-NEXT: vmov.f32 s3, s8
-; CHECK-NEXT: vstrw.32 q0, [r1]
-; CHECK-NEXT: vmov r0, r2, d6
+; CHECK-NEXT: vins.f16 s9, s8
+; CHECK-NEXT: vmov.f32 s8, s5
+; CHECK-NEXT: vins.f16 s1, s3
+; CHECK-NEXT: vmov r0, r2, d4
+; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vmov.f32 s9, s4
+; CHECK-NEXT: vmov.f32 s10, s2
+; CHECK-NEXT: vmov.f32 s11, s1
+; CHECK-NEXT: vstrw.32 q2, [r1]
; CHECK-NEXT: strd r0, r2, [r1, #16]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, pc}
entry:
%s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
%l1 = load <4 x half>, <4 x half>* %s1, align 4
define void @vst4_v4f16(<4 x half> *%src, <16 x half> *%dst) {
; CHECK-LABEL: vst4_v4f16:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r7, lr}
-; CHECK-NEXT: push {r7, lr}
-; CHECK-NEXT: add.w lr, r0, #16
-; CHECK-NEXT: ldr r2, [r0, #28]
-; CHECK-NEXT: ldm.w lr, {r3, r12, lr}
-; CHECK-NEXT: vmov.32 q1[0], lr
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.32 q0[0], r3
-; CHECK-NEXT: vmov.32 q0[1], r12
-; CHECK-NEXT: ldrd r2, r12, [r0]
-; CHECK-NEXT: ldrd r3, r0, [r0, #8]
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: add.w r6, r0, #16
+; CHECK-NEXT: ldrd lr, r12, [r0]
+; CHECK-NEXT: ldrd r3, r2, [r0, #8]
+; CHECK-NEXT: ldm r6, {r4, r5, r6}
+; CHECK-NEXT: vmov q1[2], q1[0], lr, r3
+; CHECK-NEXT: ldr r0, [r0, #28]
+; CHECK-NEXT: vmov q1[3], q1[1], r12, r2
+; CHECK-NEXT: vmov q0[2], q0[0], r4, r6
+; CHECK-NEXT: vmovx.f16 s10, s5
+; CHECK-NEXT: vmov q0[3], q0[1], r5, r0
+; CHECK-NEXT: vins.f16 s5, s7
; CHECK-NEXT: vmovx.f16 s12, s0
-; CHECK-NEXT: vmovx.f16 s2, s4
-; CHECK-NEXT: vmov.f32 s3, s5
-; CHECK-NEXT: vmov.32 q2[0], r3
-; CHECK-NEXT: vins.f16 s0, s4
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.32 q2[1], r0
-; CHECK-NEXT: vmov.32 q1[1], r12
-; CHECK-NEXT: vins.f16 s12, s2
-; CHECK-NEXT: vmovx.f16 s6, s4
-; CHECK-NEXT: vmovx.f16 s2, s8
-; CHECK-NEXT: vins.f16 s6, s2
+; CHECK-NEXT: vins.f16 s0, s2
+; CHECK-NEXT: vmovx.f16 s2, s2
; CHECK-NEXT: vmovx.f16 s11, s1
+; CHECK-NEXT: vins.f16 s12, s2
; CHECK-NEXT: vmovx.f16 s2, s3
-; CHECK-NEXT: vmovx.f16 s10, s5
; CHECK-NEXT: vins.f16 s11, s2
-; CHECK-NEXT: vmovx.f16 s2, s9
+; CHECK-NEXT: vmovx.f16 s2, s4
+; CHECK-NEXT: vins.f16 s4, s6
+; CHECK-NEXT: vmovx.f16 s6, s6
; CHECK-NEXT: vins.f16 s1, s3
-; CHECK-NEXT: vins.f16 s5, s9
-; CHECK-NEXT: vins.f16 s4, s8
+; CHECK-NEXT: vins.f16 s2, s6
+; CHECK-NEXT: vmovx.f16 s6, s7
; CHECK-NEXT: vmov.f32 s8, s5
-; CHECK-NEXT: vins.f16 s10, s2
+; CHECK-NEXT: vins.f16 s10, s6
; CHECK-NEXT: vmov.f32 s9, s1
; CHECK-NEXT: vmov.f32 s5, s0
; CHECK-NEXT: vstrh.16 q2, [r1, #16]
+; CHECK-NEXT: vmov.f32 s6, s2
; CHECK-NEXT: vmov.f32 s7, s12
; CHECK-NEXT: vstrh.16 q1, [r1]
-; CHECK-NEXT: pop {r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%s1 = getelementptr <4 x half>, <4 x half>* %src, i32 0
%l1 = load <4 x half>, <4 x half>* %s1, align 4