; V60-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r7 = #-4
+; V60-NEXT: r7:6 = combine(#64,#68)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v1:0 = vdeal(v1,v0,r7)
+; V60-NEXT: r5 = #120
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0.h = vpacko(v1.w,v0.w)
+; V60-NEXT: v1:0 = vshuff(v1,v0,r7)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v3:2 = vdeal(v0,v0,r6)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v31:30 = vdeal(v0,v1,r6)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v3:2 = vshuff(v3,v2,r5)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v1:0 = vshuff(v31,v30,r5)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v0.h = vpacko(v0.w,v2.w)
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V65-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: r7 = #-4
+; V65-NEXT: r7:6 = combine(#64,#68)
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: v1:0 = vdeal(v1,v0,r7)
+; V65-NEXT: r5 = #120
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: v0.h = vpacko(v1.w,v0.w)
+; V65-NEXT: v1:0 = vshuff(v1,v0,r7)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v3:2 = vdeal(v0,v0,r6)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v31:30 = vdeal(v0,v1,r6)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v3:2 = vshuff(v3,v2,r5)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v1:0 = vshuff(v31,v30,r5)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v0.h = vpacko(v0.w,v2.w)
; V65-NEXT: }
; V65-NEXT: {
; V65-NEXT: jumpr r31
; V69-NEXT: v1:0.w = vmpy(v1.h,v0.h)
; V69-NEXT: }
; V69-NEXT: {
-; V69-NEXT: r7 = #-4
+; V69-NEXT: r7:6 = combine(#64,#68)
+; V69-NEXT: }
+; V69-NEXT: {
+; V69-NEXT: r5 = #120
+; V69-NEXT: }
+; V69-NEXT: {
+; V69-NEXT: v1:0 = vshuff(v1,v0,r7)
+; V69-NEXT: }
+; V69-NEXT: {
+; V69-NEXT: v3:2 = vdeal(v0,v0,r6)
+; V69-NEXT: }
+; V69-NEXT: {
+; V69-NEXT: v31:30 = vdeal(v0,v1,r6)
; V69-NEXT: }
; V69-NEXT: {
-; V69-NEXT: v1:0 = vdeal(v1,v0,r7)
+; V69-NEXT: v3:2 = vshuff(v3,v2,r5)
; V69-NEXT: }
; V69-NEXT: {
-; V69-NEXT: v0.h = vpacko(v1.w,v0.w)
+; V69-NEXT: v1:0 = vshuff(v31,v30,r5)
+; V69-NEXT: }
+; V69-NEXT: {
+; V69-NEXT: v0.h = vpacko(v0.w,v2.w)
; V69-NEXT: }
; V69-NEXT: {
; V69-NEXT: jumpr r31
; V60-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: r7 = #-4
+; V60-NEXT: r7:6 = combine(#64,#68)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: r5 = #120
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v1:0 = vshuff(v1,v0,r7)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v3:2 = vdeal(v0,v0,r6)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v31:30 = vdeal(v0,v1,r6)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v1:0 = vdeal(v1,v0,r7)
+; V60-NEXT: v3:2 = vshuff(v3,v2,r5)
; V60-NEXT: }
; V60-NEXT: {
-; V60-NEXT: v0.h = vpacko(v1.w,v0.w)
+; V60-NEXT: v1:0 = vshuff(v31,v30,r5)
+; V60-NEXT: }
+; V60-NEXT: {
+; V60-NEXT: v0.h = vpacko(v0.w,v2.w)
; V60-NEXT: }
; V60-NEXT: {
; V60-NEXT: jumpr r31
; V65-NEXT: v1:0.uw = vmpy(v1.uh,v0.uh)
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: r7 = #-4
+; V65-NEXT: r7:6 = combine(#64,#68)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: r5 = #120
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v1:0 = vshuff(v1,v0,r7)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v3:2 = vdeal(v0,v0,r6)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v31:30 = vdeal(v0,v1,r6)
+; V65-NEXT: }
+; V65-NEXT: {
+; V65-NEXT: v3:2 = vshuff(v3,v2,r5)
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: v1:0 = vdeal(v1,v0,r7)
+; V65-NEXT: v1:0 = vshuff(v31,v30,r5)
; V65-NEXT: }
; V65-NEXT: {
-; V65-NEXT: v0.h = vpacko(v1.w,v0.w)
+; V65-NEXT: v0.h = vpacko(v0.w,v2.w)
; V65-NEXT: }
; V65-NEXT: {
; V65-NEXT: jumpr r31
; CHECK-NEXT: v0 = vmem(r1+#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: r7 = #-4
+; CHECK-NEXT: r7 = #64
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r5:4 = combine(#68,#120)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r3 = #15
; CHECK-NEXT: v1:0.w = vmpy(v0.h,v1.h)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1:0 = vdeal(v1,v0,r7)
+; CHECK-NEXT: v1:0 = vshuff(v1,v0,r7)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.uw = vlsr(v0.uw,r3)
+; CHECK-NEXT: v3:2 = vdeal(v0,v0,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v31:30 = vdeal(v0,v1,r5)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v3:2 = vshuff(v3,v2,r4)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v1:0 = vshuff(v31,v30,r4)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v1.uw = vlsr(v1.uw,r3)
+; CHECK-NEXT: v1.uw = vlsr(v2.uw,r3)
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.uw = vlsr(v0.uw,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
-; CHECK-NEXT: v0.h = vpacke(v1.w,v0.w)
+; CHECK-NEXT: v0.h = vpacke(v0.w,v1.w)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: vmem(r2+#0) = v0