; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.i32 q0, q2, q4
-; CHECK-NEXT: vadd.i32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.i32 q0, q2, q4
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmov.f32 s20, s13
-; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s21, s16
-; CHECK-NEXT: vmov.f32 s13, s15
-; CHECK-NEXT: vmov.f32 s22, s19
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s23, s6
-; CHECK-NEXT: vmov.f32 s15, s5
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vadd.i32 q1, q3, q5
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s4, s14
+; CHECK-NEXT: vmov.f32 s6, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s23, s9
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vadd.i32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.i32 q0, q2, q4
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vadd.i32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vmov.f64 d5, d2
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmov.f32 s20, s13
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s4, s14
+; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT: vmov.f32 s6, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s23, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT: vadd.i32 q4, q5, q4
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vadd.i32 q1, q4, q1
+; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT: vmov.f32 s24, s13
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s25, s8
+; CHECK-NEXT: vmov.f32 s22, s10
+; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s21, s16
-; CHECK-NEXT: vmov.f32 s13, s15
-; CHECK-NEXT: vmov.f32 s22, s19
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s23, s6
-; CHECK-NEXT: vmov.f32 s15, s5
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vadd.i32 q1, q3, q5
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
-; CHECK-NEXT: vmov.f64 d7, d4
-; CHECK-NEXT: vmov.f32 s15, s11
-; CHECK-NEXT: vmov.f32 s24, s17
-; CHECK-NEXT: vmov.f32 s12, s18
-; CHECK-NEXT: vmov.f32 s25, s20
-; CHECK-NEXT: vmov.f32 s17, s19
-; CHECK-NEXT: vmov.f32 s26, s23
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s27, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s13, s21
-; CHECK-NEXT: vadd.i32 q2, q4, q6
-; CHECK-NEXT: vadd.i32 q2, q2, q3
-; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT: vmov.f32 s10, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s23, s17
+; CHECK-NEXT: vmov.f32 s11, s19
+; CHECK-NEXT: vadd.i32 q5, q5, q6
+; CHECK-NEXT: vadd.i32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
-; CHECK-NEXT: vmov.f64 d9, d6
+; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s24, s21
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f64 d14, d10
; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vmov.f32 s19, s15
-; CHECK-NEXT: vmov.f32 s28, s21
-; CHECK-NEXT: vmov.f32 s16, s22
-; CHECK-NEXT: vmov.f32 s29, s24
-; CHECK-NEXT: vmov.f32 s21, s23
-; CHECK-NEXT: vmov.f32 s30, s27
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s31, s14
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s17, s25
-; CHECK-NEXT: vadd.i32 q3, q5, q7
-; CHECK-NEXT: vadd.i32 q3, q3, q4
+; CHECK-NEXT: vmov.f32 s25, s12
+; CHECK-NEXT: vmov.f32 s29, s23
+; CHECK-NEXT: vmov.f32 s26, s15
+; CHECK-NEXT: vmov.f32 s30, s14
+; CHECK-NEXT: vmov.f32 s12, s22
+; CHECK-NEXT: vmov.f32 s14, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s31, s17
+; CHECK-NEXT: vadd.i32 q6, q7, q6
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vadd.i32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #32]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #16]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.f32 q0, q2, q4
-; CHECK-NEXT: vadd.f32 q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vadd.f32 q3, q3, q4
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.f32 q0, q2, q4
-; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.f32 q3, q3, q4
+; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vmov.f64 d5, d2
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmov.f32 s20, s13
-; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s21, s16
-; CHECK-NEXT: vmov.f32 s13, s15
-; CHECK-NEXT: vmov.f32 s22, s19
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s23, s6
-; CHECK-NEXT: vmov.f32 s15, s5
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vadd.f32 q1, q3, q5
-; CHECK-NEXT: vadd.f32 q1, q1, q2
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s4, s14
+; CHECK-NEXT: vmov.f32 s6, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s23, s9
+; CHECK-NEXT: vadd.f32 q4, q5, q4
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vadd.f32 q1, q4, q1
; CHECK-NEXT: vstrw.32 q1, [r1]
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: vldrw.u32 q0, [r0, #80]
-; CHECK-NEXT: vldrw.u32 q2, [r0, #48]
-; CHECK-NEXT: vldrw.u32 q3, [r0, #64]
-; CHECK-NEXT: vmov.f64 d3, d0
-; CHECK-NEXT: vmov.f32 s7, s3
-; CHECK-NEXT: vmov.f32 s16, s9
-; CHECK-NEXT: vmov.f32 s4, s10
-; CHECK-NEXT: vmov.f32 s17, s12
-; CHECK-NEXT: vmov.f32 s9, s11
-; CHECK-NEXT: vmov.f32 s18, s15
-; CHECK-NEXT: vmov.f32 s10, s14
-; CHECK-NEXT: vmov.f32 s19, s2
-; CHECK-NEXT: vmov.f32 s11, s1
-; CHECK-NEXT: vmov.f32 s5, s13
-; CHECK-NEXT: vadd.f32 q0, q2, q4
-; CHECK-NEXT: vadd.f32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [r0, #32]
+; CHECK-NEXT: vldrw.u32 q1, [r0, #48]
+; CHECK-NEXT: vldrw.u32 q0, [r0, #64]
+; CHECK-NEXT: vldrw.u32 q2, [r0, #80]
+; CHECK-NEXT: vmov.f64 d6, d2
+; CHECK-NEXT: vmov.f32 s16, s5
+; CHECK-NEXT: vmov.f32 s13, s7
+; CHECK-NEXT: vmov.f32 s17, s0
+; CHECK-NEXT: vmov.f32 s14, s2
+; CHECK-NEXT: vmov.f32 s18, s3
+; CHECK-NEXT: vmov.f32 s0, s6
+; CHECK-NEXT: vldrw.u32 q1, [r0, #16]
+; CHECK-NEXT: vmov.f32 s2, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s15, s9
+; CHECK-NEXT: vmov.f32 s3, s11
+; CHECK-NEXT: vadd.f32 q3, q3, q4
+; CHECK-NEXT: vadd.f32 q0, q3, q0
; CHECK-NEXT: vldrw.u32 q3, [r0]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #16]
-; CHECK-NEXT: vmov.f64 d5, d2
-; CHECK-NEXT: vmov.f32 s11, s7
-; CHECK-NEXT: vmov.f32 s20, s13
+; CHECK-NEXT: vldrw.u32 q2, [r0, #32]
+; CHECK-NEXT: vmov.f32 s16, s13
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vmov.f32 s17, s4
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s18, s7
+; CHECK-NEXT: vmov.f32 s22, s6
+; CHECK-NEXT: vmov.f32 s4, s14
+; CHECK-NEXT: vldrw.u32 q3, [r0, #144]
+; CHECK-NEXT: vmov.f32 s6, s8
+; CHECK-NEXT: vmov.f32 s19, s10
+; CHECK-NEXT: vmov.f32 s23, s9
+; CHECK-NEXT: vmov.f32 s7, s11
+; CHECK-NEXT: vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT: vadd.f32 q4, q5, q4
+; CHECK-NEXT: vmov.f64 d10, d6
+; CHECK-NEXT: vadd.f32 q1, q4, q1
+; CHECK-NEXT: vldrw.u32 q4, [r0, #176]
+; CHECK-NEXT: vmov.f32 s24, s13
+; CHECK-NEXT: vmov.f32 s21, s15
+; CHECK-NEXT: vmov.f32 s25, s8
+; CHECK-NEXT: vmov.f32 s22, s10
+; CHECK-NEXT: vmov.f32 s26, s11
; CHECK-NEXT: vmov.f32 s8, s14
-; CHECK-NEXT: vmov.f32 s21, s16
-; CHECK-NEXT: vmov.f32 s13, s15
-; CHECK-NEXT: vmov.f32 s22, s19
-; CHECK-NEXT: vmov.f32 s14, s18
-; CHECK-NEXT: vmov.f32 s23, s6
-; CHECK-NEXT: vmov.f32 s15, s5
-; CHECK-NEXT: vmov.f32 s9, s17
-; CHECK-NEXT: vadd.f32 q1, q3, q5
-; CHECK-NEXT: vadd.f32 q1, q1, q2
-; CHECK-NEXT: vldrw.u32 q2, [r0, #176]
-; CHECK-NEXT: vldrw.u32 q4, [r0, #144]
-; CHECK-NEXT: vldrw.u32 q5, [r0, #160]
-; CHECK-NEXT: vmov.f64 d7, d4
-; CHECK-NEXT: vmov.f32 s15, s11
-; CHECK-NEXT: vmov.f32 s24, s17
-; CHECK-NEXT: vmov.f32 s12, s18
-; CHECK-NEXT: vmov.f32 s25, s20
-; CHECK-NEXT: vmov.f32 s17, s19
-; CHECK-NEXT: vmov.f32 s26, s23
-; CHECK-NEXT: vmov.f32 s18, s22
-; CHECK-NEXT: vmov.f32 s27, s10
-; CHECK-NEXT: vmov.f32 s19, s9
-; CHECK-NEXT: vmov.f32 s13, s21
-; CHECK-NEXT: vadd.f32 q2, q4, q6
-; CHECK-NEXT: vadd.f32 q2, q2, q3
-; CHECK-NEXT: vldrw.u32 q3, [r0, #128]
+; CHECK-NEXT: vldrw.u32 q3, [r0, #112]
+; CHECK-NEXT: vmov.f32 s10, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s23, s17
+; CHECK-NEXT: vmov.f32 s11, s19
+; CHECK-NEXT: vadd.f32 q5, q5, q6
+; CHECK-NEXT: vadd.f32 q2, q5, q2
; CHECK-NEXT: vldrw.u32 q5, [r0, #96]
-; CHECK-NEXT: vldrw.u32 q6, [r0, #112]
-; CHECK-NEXT: vmov.f64 d9, d6
+; CHECK-NEXT: vldrw.u32 q4, [r0, #128]
; CHECK-NEXT: vstrw.32 q2, [r1, #48]
+; CHECK-NEXT: vmov.f32 s24, s21
; CHECK-NEXT: vstrw.32 q0, [r1, #16]
+; CHECK-NEXT: vmov.f64 d14, d10
; CHECK-NEXT: vstrw.32 q1, [r1]
-; CHECK-NEXT: vmov.f32 s19, s15
-; CHECK-NEXT: vmov.f32 s28, s21
-; CHECK-NEXT: vmov.f32 s16, s22
-; CHECK-NEXT: vmov.f32 s29, s24
-; CHECK-NEXT: vmov.f32 s21, s23
-; CHECK-NEXT: vmov.f32 s30, s27
-; CHECK-NEXT: vmov.f32 s22, s26
-; CHECK-NEXT: vmov.f32 s31, s14
-; CHECK-NEXT: vmov.f32 s23, s13
-; CHECK-NEXT: vmov.f32 s17, s25
-; CHECK-NEXT: vadd.f32 q3, q5, q7
-; CHECK-NEXT: vadd.f32 q3, q3, q4
+; CHECK-NEXT: vmov.f32 s25, s12
+; CHECK-NEXT: vmov.f32 s29, s23
+; CHECK-NEXT: vmov.f32 s26, s15
+; CHECK-NEXT: vmov.f32 s30, s14
+; CHECK-NEXT: vmov.f32 s12, s22
+; CHECK-NEXT: vmov.f32 s14, s16
+; CHECK-NEXT: vmov.f32 s27, s18
+; CHECK-NEXT: vmov.f32 s31, s17
+; CHECK-NEXT: vadd.f32 q6, q7, q6
+; CHECK-NEXT: vmov.f32 s15, s19
+; CHECK-NEXT: vadd.f32 q3, q6, q3
; CHECK-NEXT: vstrw.32 q3, [r1, #32]
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
; CHECK-NEXT: vmovaps %ymm4, %ymm10
; CHECK-NEXT: vmovaps %ymm3, %ymm9
; CHECK-NEXT: vmovaps %ymm1, %ymm8
-; CHECK-NEXT: vmovaps %ymm0, %ymm4
+; CHECK-NEXT: vmovaps %ymm0, %ymm3
; CHECK-NEXT: vmovaps 240(%rbp), %ymm1
-; CHECK-NEXT: vmovaps 208(%rbp), %ymm3
+; CHECK-NEXT: vmovaps 208(%rbp), %ymm4
; CHECK-NEXT: vmovaps 176(%rbp), %ymm0
; CHECK-NEXT: vmovaps 144(%rbp), %ymm0
; CHECK-NEXT: vmovaps 112(%rbp), %ymm11
; CHECK-NEXT: vmovaps 80(%rbp), %ymm11
; CHECK-NEXT: vmovaps 48(%rbp), %ymm11
; CHECK-NEXT: vmovaps 16(%rbp), %ymm11
-; CHECK-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vmovaps %xmm3, %xmm8
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT: vmovaps %xmm4, %xmm6
; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vinserti128 $1, %xmm8, %ymm2, %ymm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm4[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2
+; CHECK-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
-; CHECK-NEXT: vmovaps %xmm7, %xmm2
-; CHECK-NEXT: vpslldq {{.*#+}} xmm4 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
+; CHECK-NEXT: vmovq {{.*#+}} xmm6 = xmm2[0],zero
; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vmovaps %xmm4, %xmm2
-; CHECK-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,3]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3],ymm3[4,5,6,7]
+; CHECK-NEXT: vmovaps %xmm6, %xmm2
+; CHECK-NEXT: # kill: def $xmm3 killed $xmm3 killed $ymm3
+; CHECK-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; CHECK-NEXT: vmovaps %xmm7, %xmm3
+; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: # implicit-def: $ymm3
+; CHECK-NEXT: vmovaps %xmm6, %xmm3
+; CHECK-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
; CHECK-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
-; CHECK-NEXT: vpshufd {{.*#+}} ymm2 = ymm5[0,1,0,1,4,5,4,5]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT: vextracti128 $1, %ymm7, %xmm2
-; CHECK-NEXT: vmovq {{.*#+}} xmm4 = xmm2[0],zero
-; CHECK-NEXT: # implicit-def: $ymm2
-; CHECK-NEXT: vmovaps %xmm4, %xmm2
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm6[0,1]
+; CHECK-NEXT: vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
+; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; CHECK-NEXT: movq %rbp, %rsp
; CHECK-NEXT: popq %rbp
; CHECK-NEXT: .cfi_def_cfa %rsp, 8