This can be useful since addition is commutative, while subtraction is not.
This matches a transform that is also done by InstCombine.
llvm-svn: 338181
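For reference, the fold added below, (A-(B-C)) -> A+(C-B), is value-preserving in two's-complement (modular) arithmetic, and rewriting the outer node as an ADD gives later combines a commutative operation to reassociate. The following is a minimal standalone sketch (not part of the patch; the helper names are illustrative only) that checks the identity under 32-bit wrap-around:

```cpp
// Standalone illustration: A - (B - C) and A + (C - B) agree under
// 32-bit wrapping arithmetic, which is why the DAG combine below is
// value-preserving.
#include <cassert>
#include <cstdint>

static uint32_t originalForm(uint32_t A, uint32_t B, uint32_t C) {
  return A - (B - C);   // original form: a SUB feeding a SUB
}

static uint32_t foldedForm(uint32_t A, uint32_t B, uint32_t C) {
  return A + (C - B);   // rewritten form: a SUB feeding an ADD
}

int main() {
  const uint32_t Samples[] = {0u, 1u, 7u, 255u, 0x80000000u, 0xFFFFFFFFu};
  for (uint32_t A : Samples)
    for (uint32_t B : Samples)
      for (uint32_t C : Samples)
        assert(originalForm(A, B, C) == foldedForm(A, B, C));
  return 0;
}
```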
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
N0.getOperand(1).getOperand(0));
+ // fold (A-(B-C)) -> A+(C-B)
+ if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
+ N1.getOperand(0)));
+
// fold (X - (-Y * Z)) -> (X + (Y * Z))
if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
if (N1.getOperand(0).getOpcode() == ISD::SUB &&
; CHECK-NOT: sub
; CHECK: b.ge
; CHECK: sub
-; CHECK: sub
+; CHECK-NEXT: add
; CHECK-NOT: sub
; CHECK: ret
%0 = load i32, i32* %offset, align 4
%s2 = sub nsw i32 %s, %size
%s3 = sub nsw i32 %sub, %s2
; CHECK: sub [[R1:r[0-9]+]], [[R2:r[0-9]+]], r2
-; CHECK: sub [[R3:r[0-9]+]], [[R1]], r2
-; CHECK: sub [[R4:r[0-9]+]], [[R1]], [[R3]]
+; CHECK: sub [[R3:r[0-9]+]], r2, [[R1]]
+; CHECK: add [[R4:r[0-9]+]], [[R1]], [[R3]]
; CHECK-NOT: sub
; CHECK: str
store i32 %s3, i32* %offset, align 4
; MIPS32-NEXT: sll $2, $4, 4
; MIPS32-NEXT: subu $1, $2, $1
; MIPS32-NEXT: sll $2, $4, 6
-; MIPS32-NEXT: subu $1, $2, $1
+; MIPS32-NEXT: subu $1, $1, $2
; MIPS32-NEXT: sll $2, $4, 8
-; MIPS32-NEXT: subu $1, $2, $1
+; MIPS32-NEXT: addu $1, $2, $1
; MIPS32-NEXT: sll $2, $4, 10
-; MIPS32-NEXT: subu $1, $2, $1
+; MIPS32-NEXT: subu $1, $1, $2
; MIPS32-NEXT: sll $2, $4, 13
-; MIPS32-NEXT: subu $1, $2, $1
+; MIPS32-NEXT: addu $1, $2, $1
; MIPS32-NEXT: sll $2, $4, 16
-; MIPS32-NEXT: subu $1, $2, $1
+; MIPS32-NEXT: subu $1, $1, $2
; MIPS32-NEXT: sll $2, $4, 24
; MIPS32-NEXT: sll $3, $4, 22
; MIPS32-NEXT: sll $5, $4, 20
; MIPS32-NEXT: sll $4, $4, 18
-; MIPS32-NEXT: subu $1, $4, $1
+; MIPS32-NEXT: addu $1, $4, $1
; MIPS32-NEXT: addu $1, $5, $1
; MIPS32-NEXT: addu $1, $3, $1
; MIPS32-NEXT: jr $ra
; MIPS64-NEXT: sll $3, $1, 4
; MIPS64-NEXT: subu $2, $3, $2
; MIPS64-NEXT: sll $3, $1, 6
-; MIPS64-NEXT: subu $2, $3, $2
+; MIPS64-NEXT: subu $2, $2, $3
; MIPS64-NEXT: sll $3, $1, 8
-; MIPS64-NEXT: subu $2, $3, $2
+; MIPS64-NEXT: addu $2, $3, $2
; MIPS64-NEXT: sll $3, $1, 10
-; MIPS64-NEXT: subu $2, $3, $2
+; MIPS64-NEXT: subu $2, $2, $3
; MIPS64-NEXT: sll $3, $1, 13
-; MIPS64-NEXT: subu $2, $3, $2
+; MIPS64-NEXT: addu $2, $3, $2
; MIPS64-NEXT: sll $3, $1, 16
-; MIPS64-NEXT: subu $2, $3, $2
+; MIPS64-NEXT: subu $2, $2, $3
; MIPS64-NEXT: sll $3, $1, 24
; MIPS64-NEXT: sll $4, $1, 22
; MIPS64-NEXT: sll $5, $1, 20
; MIPS64-NEXT: sll $1, $1, 18
-; MIPS64-NEXT: subu $1, $1, $2
+; MIPS64-NEXT: addu $1, $1, $2
; MIPS64-NEXT: addu $1, $5, $1
; MIPS64-NEXT: addu $1, $4, $1
; MIPS64-NEXT: jr $ra
; MIPS32-NEXT: sll $4, $5, 4
; MIPS32-NEXT: subu $3, $4, $3
; MIPS32-NEXT: sll $4, $5, 6
-; MIPS32-NEXT: subu $3, $4, $3
+; MIPS32-NEXT: subu $3, $3, $4
; MIPS32-NEXT: sll $4, $5, 8
-; MIPS32-NEXT: subu $3, $4, $3
+; MIPS32-NEXT: addu $3, $4, $3
; MIPS32-NEXT: sll $4, $5, 10
-; MIPS32-NEXT: subu $3, $4, $3
+; MIPS32-NEXT: subu $3, $3, $4
; MIPS32-NEXT: sll $4, $5, 13
-; MIPS32-NEXT: subu $3, $4, $3
+; MIPS32-NEXT: addu $3, $4, $3
; MIPS32-NEXT: sll $4, $5, 16
-; MIPS32-NEXT: subu $3, $4, $3
+; MIPS32-NEXT: subu $3, $3, $4
; MIPS32-NEXT: sll $4, $5, 24
; MIPS32-NEXT: sll $6, $5, 22
; MIPS32-NEXT: sll $7, $5, 20
; MIPS32-NEXT: sll $5, $5, 18
-; MIPS32-NEXT: subu $3, $5, $3
+; MIPS32-NEXT: addu $3, $5, $3
; MIPS32-NEXT: addu $3, $7, $3
; MIPS32-NEXT: addu $3, $6, $3
; MIPS32-NEXT: addu $3, $4, $3
; MIPS64-NEXT: dsll $2, $4, 4
; MIPS64-NEXT: dsubu $1, $2, $1
; MIPS64-NEXT: dsll $2, $4, 6
-; MIPS64-NEXT: dsubu $1, $2, $1
+; MIPS64-NEXT: dsubu $1, $1, $2
; MIPS64-NEXT: dsll $2, $4, 8
-; MIPS64-NEXT: dsubu $1, $2, $1
+; MIPS64-NEXT: daddu $1, $2, $1
; MIPS64-NEXT: dsll $2, $4, 10
-; MIPS64-NEXT: dsubu $1, $2, $1
+; MIPS64-NEXT: dsubu $1, $1, $2
; MIPS64-NEXT: dsll $2, $4, 13
-; MIPS64-NEXT: dsubu $1, $2, $1
+; MIPS64-NEXT: daddu $1, $2, $1
; MIPS64-NEXT: dsll $2, $4, 16
-; MIPS64-NEXT: dsubu $1, $2, $1
+; MIPS64-NEXT: dsubu $1, $1, $2
; MIPS64-NEXT: dsll $2, $4, 24
; MIPS64-NEXT: dsll $3, $4, 22
; MIPS64-NEXT: dsll $5, $4, 20
; MIPS64-NEXT: dsll $4, $4, 18
-; MIPS64-NEXT: dsubu $1, $4, $1
+; MIPS64-NEXT: daddu $1, $4, $1
; MIPS64-NEXT: daddu $1, $5, $1
; MIPS64-NEXT: daddu $1, $3, $1
; MIPS64-NEXT: jr $ra
; CHECK-NEXT: addl %eax, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: subl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: subl %eax, %edx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = srem i32 %X, 255
; CHECK-NEXT: shrl $7, %edx
; CHECK-NEXT: movl %edx, %eax
; CHECK-NEXT: shll $8, %eax
-; CHECK-NEXT: subl %edx, %eax
-; CHECK-NEXT: subl %eax, %ecx
+; CHECK-NEXT: subl %eax, %edx
+; CHECK-NEXT: addl %edx, %ecx
; CHECK-NEXT: movl %ecx, %eax
; CHECK-NEXT: retl
%tmp1 = urem i32 %X, 255
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: sarq %rdx
; SSE2-NEXT: addq %rax, %rdx
; SSE2-NEXT: leaq (,%rdx,8), %rax
-; SSE2-NEXT: subq %rdx, %rax
-; SSE2-NEXT: subq %rax, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: subq %rax, %rdx
+; SSE2-NEXT: addq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: imulq %rsi
; SSE41-NEXT: sarq %rdx
; SSE41-NEXT: addq %rax, %rdx
; SSE41-NEXT: leaq (,%rdx,8), %rax
-; SSE41-NEXT: subq %rdx, %rax
-; SSE41-NEXT: subq %rax, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: subq %rax, %rdx
+; SSE41-NEXT: addq %rcx, %rdx
+; SSE41-NEXT: movq %rdx, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = srem <2 x i64> %a, <i64 7, i64 7>
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: imulq %rsi
; AVX1-NEXT: sarq %rdx
; AVX1-NEXT: addq %rax, %rdx
; AVX1-NEXT: leaq (,%rdx,8), %rax
-; AVX1-NEXT: subq %rdx, %rax
-; AVX1-NEXT: subq %rax, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: subq %rax, %rdx
+; AVX1-NEXT: addq %rcx, %rdx
+; AVX1-NEXT: vmovq %rdx, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: imulq %rsi
; AVX2-NEXT: sarq %rdx
; AVX2-NEXT: addq %rax, %rdx
; AVX2-NEXT: leaq (,%rdx,8), %rax
-; AVX2-NEXT: subq %rdx, %rax
-; AVX2-NEXT: subq %rax, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: subq %rax, %rdx
+; AVX2-NEXT: addq %rcx, %rdx
+; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: imulq %rsi
; AVX-NEXT: sarq %rdx
; AVX-NEXT: addq %rax, %rdx
; AVX-NEXT: leaq (,%rdx,8), %rax
-; AVX-NEXT: subq %rdx, %rax
-; AVX-NEXT: subq %rax, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rax, %rdx
+; AVX-NEXT: addq %rcx, %rdx
+; AVX-NEXT: vmovq %rdx, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: shrq $2, %rax
; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rax, %rdx
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm1
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: addq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT: movq %xmm0, %rcx
; SSE2-NEXT: movq %rcx, %rax
; SSE2-NEXT: addq %rdx, %rax
; SSE2-NEXT: shrq $2, %rax
; SSE2-NEXT: leaq (,%rax,8), %rdx
-; SSE2-NEXT: subq %rax, %rdx
-; SSE2-NEXT: subq %rdx, %rcx
-; SSE2-NEXT: movq %rcx, %xmm0
+; SSE2-NEXT: subq %rdx, %rax
+; SSE2-NEXT: addq %rcx, %rax
+; SSE2-NEXT: movq %rax, %xmm0
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE41-NEXT: addq %rdx, %rax
; SSE41-NEXT: shrq $2, %rax
; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rax, %rdx
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm1
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: addq %rcx, %rax
+; SSE41-NEXT: movq %rax, %xmm1
; SSE41-NEXT: movq %xmm0, %rcx
; SSE41-NEXT: movq %rcx, %rax
; SSE41-NEXT: mulq %rsi
; SSE41-NEXT: addq %rdx, %rax
; SSE41-NEXT: shrq $2, %rax
; SSE41-NEXT: leaq (,%rax,8), %rdx
-; SSE41-NEXT: subq %rax, %rdx
-; SSE41-NEXT: subq %rdx, %rcx
-; SSE41-NEXT: movq %rcx, %xmm0
+; SSE41-NEXT: subq %rdx, %rax
+; SSE41-NEXT: addq %rcx, %rax
+; SSE41-NEXT: movq %rax, %xmm0
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT: retq
;
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
%res = urem <2 x i64> %a, <i64 7, i64 7>
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm1, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm1
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm2
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm2
; AVX1-NEXT: vmovq %xmm0, %rcx
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: mulq %rsi
; AVX1-NEXT: addq %rdx, %rax
; AVX1-NEXT: shrq $2, %rax
; AVX1-NEXT: leaq (,%rax,8), %rdx
-; AVX1-NEXT: subq %rax, %rdx
-; AVX1-NEXT: subq %rdx, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
+; AVX1-NEXT: subq %rdx, %rax
+; AVX1-NEXT: addq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovq %xmm1, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm1
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm2
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vmovq %xmm0, %rcx
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: mulq %rsi
; AVX2-NEXT: addq %rdx, %rax
; AVX2-NEXT: shrq $2, %rax
; AVX2-NEXT: leaq (,%rax,8), %rdx
-; AVX2-NEXT: subq %rax, %rdx
-; AVX2-NEXT: subq %rdx, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
+; AVX2-NEXT: subq %rdx, %rax
+; AVX2-NEXT: addq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vmovq %xmm1, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm1
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX-NEXT: vextracti32x4 $2, %zmm0, %xmm2
; AVX-NEXT: vpextrq $1, %xmm2, %rcx
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm2, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm2
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; AVX-NEXT: vpextrq $1, %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm3
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm3
; AVX-NEXT: vmovq %xmm0, %rcx
; AVX-NEXT: movq %rcx, %rax
; AVX-NEXT: mulq %rsi
; AVX-NEXT: addq %rdx, %rax
; AVX-NEXT: shrq $2, %rax
; AVX-NEXT: leaq (,%rax,8), %rdx
-; AVX-NEXT: subq %rax, %rdx
-; AVX-NEXT: subq %rdx, %rcx
-; AVX-NEXT: vmovq %rcx, %xmm0
+; AVX-NEXT: subq %rdx, %rax
+; AVX-NEXT: addq %rcx, %rax
+; AVX-NEXT: vmovq %rax, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
; AVX-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0