; AVX-NEXT: pushq %rbx
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %r9
-; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rdx
; AVX-NEXT: movq %r9, %r8
; AVX-NEXT: shrq $56, %r8
; AVX-NEXT: andl $15, %r8d
; AVX-NEXT: movq %r9, %r10
; AVX-NEXT: shrq $48, %r10
; AVX-NEXT: andl $15, %r10d
-; AVX-NEXT: movq %rcx, %rdx
-; AVX-NEXT: shldq $24, %r9, %rdx
-; AVX-NEXT: andl $15, %edx
+; AVX-NEXT: movq %r9, %rsi
+; AVX-NEXT: shrq $40, %rsi
+; AVX-NEXT: andl $15, %esi
; AVX-NEXT: movq %r9, %r11
; AVX-NEXT: shrq $32, %r11
; AVX-NEXT: andl $15, %r11d
-; AVX-NEXT: movq %rcx, %rdi
+; AVX-NEXT: movq %rdx, %rdi
; AVX-NEXT: shrq $56, %rdi
; AVX-NEXT: andl $15, %edi
-; AVX-NEXT: movq %rcx, %rsi
-; AVX-NEXT: shrq $48, %rsi
-; AVX-NEXT: andl $15, %esi
-; AVX-NEXT: movq %rcx, %rax
-; AVX-NEXT: shrq $40, %rax
+; AVX-NEXT: movq %rdx, %rax
+; AVX-NEXT: shrq $48, %rax
; AVX-NEXT: andl $15, %eax
-; AVX-NEXT: movq %rcx, %rbx
+; AVX-NEXT: movq %rdx, %rcx
+; AVX-NEXT: shrq $40, %rcx
+; AVX-NEXT: andl $15, %ecx
+; AVX-NEXT: movq %rdx, %rbx
; AVX-NEXT: shrq $32, %rbx
; AVX-NEXT: andl $15, %ebx
; AVX-NEXT: shlq $32, %rbx
-; AVX-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F
-; AVX-NEXT: orq %rbx, %rcx
-; AVX-NEXT: shlq $40, %rax
+; AVX-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F
+; AVX-NEXT: orq %rbx, %rdx
+; AVX-NEXT: shlq $40, %rcx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: shlq $48, %rax
; AVX-NEXT: orq %rcx, %rax
-; AVX-NEXT: shlq $48, %rsi
-; AVX-NEXT: orq %rax, %rsi
; AVX-NEXT: shlq $56, %rdi
-; AVX-NEXT: orq %rsi, %rdi
+; AVX-NEXT: orq %rax, %rdi
; AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; AVX-NEXT: shlq $32, %r11
; AVX-NEXT: andl $252645135, %r9d # imm = 0xF0F0F0F
; AVX-NEXT: orq %r11, %r9
-; AVX-NEXT: shlq $40, %rdx
-; AVX-NEXT: orq %r9, %rdx
+; AVX-NEXT: shlq $40, %rsi
+; AVX-NEXT: orq %r9, %rsi
; AVX-NEXT: shlq $48, %r10
-; AVX-NEXT: orq %rdx, %r10
+; AVX-NEXT: orq %rsi, %r10
; AVX-NEXT: shlq $56, %r8
; AVX-NEXT: orq %r10, %r8
; AVX-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX1: # %bb.0:
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq %rax, %r8
+; AVX1-NEXT: movq %rax, %rcx
; AVX1-NEXT: movq %rax, %rdx
; AVX1-NEXT: movq %rax, %rsi
; AVX1-NEXT: movq %rax, %rdi
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: andl $15, %ecx
-; AVX1-NEXT: shlq $32, %rcx
-; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX1-NEXT: orq %rcx, %rax
-; AVX1-NEXT: shrq $40, %rdi
+; AVX1-NEXT: shrq $32, %rdi
; AVX1-NEXT: andl $15, %edi
-; AVX1-NEXT: shlq $40, %rdi
-; AVX1-NEXT: orq %rax, %rdi
-; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: shlq $32, %rdi
+; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; AVX1-NEXT: orq %rdi, %rax
+; AVX1-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; AVX1-NEXT: shrq $40, %rsi
; AVX1-NEXT: andl $15, %esi
-; AVX1-NEXT: shlq $48, %rsi
-; AVX1-NEXT: orq %rdi, %rsi
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $56, %rdx
+; AVX1-NEXT: shlq $40, %rsi
+; AVX1-NEXT: orq %rax, %rsi
+; AVX1-NEXT: movq %rdi, %rax
+; AVX1-NEXT: shrq $48, %rdx
; AVX1-NEXT: andl $15, %edx
-; AVX1-NEXT: shlq $56, %rdx
+; AVX1-NEXT: shlq $48, %rdx
; AVX1-NEXT: orq %rsi, %rdx
-; AVX1-NEXT: movq %rax, %rsi
-; AVX1-NEXT: shldq $24, %rax, %r8
-; AVX1-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movq %rax, %rdx
-; AVX1-NEXT: shrq $32, %rdx
-; AVX1-NEXT: andl $15, %edx
-; AVX1-NEXT: shlq $32, %rdx
-; AVX1-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX1-NEXT: orq %rdx, %rax
-; AVX1-NEXT: andl $15, %r8d
-; AVX1-NEXT: shlq $40, %r8
-; AVX1-NEXT: orq %rax, %r8
-; AVX1-NEXT: shrq $48, %rsi
-; AVX1-NEXT: andl $15, %esi
-; AVX1-NEXT: shlq $48, %rsi
-; AVX1-NEXT: orq %r8, %rsi
+; AVX1-NEXT: movq %rdi, %rsi
; AVX1-NEXT: shrq $56, %rcx
; AVX1-NEXT: andl $15, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: shlq $56, %rcx
-; AVX1-NEXT: orq %rsi, %rcx
-; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: orq %rdx, %rcx
+; AVX1-NEXT: movq %rdi, %rdx
; AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $8, %ecx
-; AVX1-NEXT: vmovd %eax, %xmm1
-; AVX1-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $16, %ecx
-; AVX1-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movl %eax, %ecx
-; AVX1-NEXT: shrl $24, %ecx
-; AVX1-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: movq %rdi, %rcx
; AVX1-NEXT: shrq $32, %rcx
-; AVX1-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $40, %rcx
-; AVX1-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: movq %rax, %rcx
-; AVX1-NEXT: shrq $48, %rcx
-; AVX1-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX1-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX1-NEXT: andl $15, %ecx
+; AVX1-NEXT: shlq $32, %rcx
+; AVX1-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; AVX1-NEXT: orq %rcx, %rdi
+; AVX1-NEXT: shrq $40, %rdx
+; AVX1-NEXT: andl $15, %edx
+; AVX1-NEXT: shlq $40, %rdx
+; AVX1-NEXT: orq %rdi, %rdx
+; AVX1-NEXT: shrq $48, %rsi
+; AVX1-NEXT: andl $15, %esi
+; AVX1-NEXT: shlq $48, %rsi
+; AVX1-NEXT: orq %rdx, %rsi
; AVX1-NEXT: shrq $56, %rax
-; AVX1-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
+; AVX1-NEXT: andl $15, %eax
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT: shlq $56, %rax
+; AVX1-NEXT: orq %rsi, %rax
+; AVX1-NEXT: vmovq %xmm0, %rcx
+; AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX1-NEXT: movl %ecx, %eax
; AVX1-NEXT: shrl $8, %eax
-; AVX1-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX1-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vmovd %ecx, %xmm1
+; AVX1-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; AVX1-NEXT: movl %ecx, %eax
; AVX1-NEXT: shrl $16, %eax
-; AVX1-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; AVX1-NEXT: movl %ecx, %eax
; AVX1-NEXT: shrl $24, %eax
-; AVX1-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: shrq $32, %rax
-; AVX1-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: shrq $40, %rax
-; AVX1-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; AVX1-NEXT: movq %rcx, %rax
; AVX1-NEXT: shrq $48, %rax
-; AVX1-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
; AVX1-NEXT: shrq $56, %rcx
-; AVX1-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $8, %ecx
+; AVX1-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $16, %ecx
+; AVX1-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movl %eax, %ecx
+; AVX1-NEXT: shrl $24, %ecx
+; AVX1-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $32, %rcx
+; AVX1-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $40, %rcx
+; AVX1-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: movq %rax, %rcx
+; AVX1-NEXT: shrq $48, %rcx
+; AVX1-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT: shrq $56, %rax
+; AVX1-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX1-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq %rax, %rcx
; AVX2-NEXT: movq %rax, %rdx
; AVX2-NEXT: movq %rax, %rsi
; AVX2-NEXT: movq %rax, %rdi
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: shlq $32, %rcx
-; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: shrq $40, %rdi
+; AVX2-NEXT: shrq $32, %rdi
; AVX2-NEXT: andl $15, %edi
-; AVX2-NEXT: shlq $40, %rdi
-; AVX2-NEXT: orq %rax, %rdi
-; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: shlq $32, %rdi
+; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
+; AVX2-NEXT: orq %rdi, %rax
+; AVX2-NEXT: movq -{{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: shrq $40, %rsi
; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: shlq $48, %rsi
-; AVX2-NEXT: orq %rdi, %rsi
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $56, %rdx
+; AVX2-NEXT: shlq $40, %rsi
+; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: movq %rdi, %rax
+; AVX2-NEXT: shrq $48, %rdx
; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: shlq $56, %rdx
+; AVX2-NEXT: shlq $48, %rdx
; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: shldq $24, %rax, %r8
-; AVX2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movq %rax, %rdx
-; AVX2-NEXT: shrq $32, %rdx
-; AVX2-NEXT: andl $15, %edx
-; AVX2-NEXT: shlq $32, %rdx
-; AVX2-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F
-; AVX2-NEXT: orq %rdx, %rax
-; AVX2-NEXT: andl $15, %r8d
-; AVX2-NEXT: shlq $40, %r8
-; AVX2-NEXT: orq %rax, %r8
-; AVX2-NEXT: shrq $48, %rsi
-; AVX2-NEXT: andl $15, %esi
-; AVX2-NEXT: shlq $48, %rsi
-; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: movq %rdi, %rsi
; AVX2-NEXT: shrq $56, %rcx
; AVX2-NEXT: andl $15, %ecx
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: shlq $56, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movq %rdi, %rdx
; AVX2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $8, %ecx
-; AVX2-NEXT: vmovd %eax, %xmm1
-; AVX2-NEXT: vpinsrb $1, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $16, %ecx
-; AVX2-NEXT: vpinsrb $2, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movl %eax, %ecx
-; AVX2-NEXT: shrl $24, %ecx
-; AVX2-NEXT: vpinsrb $3, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: movq %rdi, %rcx
; AVX2-NEXT: shrq $32, %rcx
-; AVX2-NEXT: vpinsrb $4, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $40, %rcx
-; AVX2-NEXT: vpinsrb $5, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: movq %rax, %rcx
-; AVX2-NEXT: shrq $48, %rcx
-; AVX2-NEXT: vpinsrb $6, %ecx, %xmm1, %xmm1
-; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: andl $15, %ecx
+; AVX2-NEXT: shlq $32, %rcx
+; AVX2-NEXT: andl $252645135, %edi # imm = 0xF0F0F0F
+; AVX2-NEXT: orq %rcx, %rdi
+; AVX2-NEXT: shrq $40, %rdx
+; AVX2-NEXT: andl $15, %edx
+; AVX2-NEXT: shlq $40, %rdx
+; AVX2-NEXT: orq %rdi, %rdx
+; AVX2-NEXT: shrq $48, %rsi
+; AVX2-NEXT: andl $15, %esi
+; AVX2-NEXT: shlq $48, %rsi
+; AVX2-NEXT: orq %rdx, %rsi
; AVX2-NEXT: shrq $56, %rax
-; AVX2-NEXT: vpinsrb $7, %eax, %xmm1, %xmm0
+; AVX2-NEXT: andl $15, %eax
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT: shlq $56, %rax
+; AVX2-NEXT: orq %rsi, %rax
+; AVX2-NEXT: vmovq %xmm0, %rcx
+; AVX2-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrl $8, %eax
-; AVX2-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0
-; AVX2-NEXT: vpinsrb $9, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vmovd %ecx, %xmm1
+; AVX2-NEXT: vpinsrb $1, %eax, %xmm1, %xmm1
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrl $16, %eax
-; AVX2-NEXT: vpinsrb $10, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $2, %eax, %xmm1, %xmm1
; AVX2-NEXT: movl %ecx, %eax
; AVX2-NEXT: shrl $24, %eax
-; AVX2-NEXT: vpinsrb $11, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $3, %eax, %xmm1, %xmm1
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: shrq $32, %rax
-; AVX2-NEXT: vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $4, %eax, %xmm1, %xmm1
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: shrq $40, %rax
-; AVX2-NEXT: vpinsrb $13, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $5, %eax, %xmm1, %xmm1
; AVX2-NEXT: movq %rcx, %rax
; AVX2-NEXT: shrq $48, %rax
-; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
; AVX2-NEXT: shrq $56, %rcx
-; AVX2-NEXT: vpinsrb $15, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $7, %ecx, %xmm1, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $8, %ecx
+; AVX2-NEXT: vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $16, %ecx
+; AVX2-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movl %eax, %ecx
+; AVX2-NEXT: shrl $24, %ecx
+; AVX2-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $32, %rcx
+; AVX2-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $40, %rcx
+; AVX2-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: shrq $48, %rcx
+; AVX2-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT: shrq $56, %rax
+; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa -{{[0-9]+}}(%rsp), %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX2-NEXT: retq
; X86-NEXT: movl 24(%ebp), %edx
; X86-NEXT: movl 40(%ebp), %edi
; X86-NEXT: leal {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: sarl $31, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: sarl $31, %ecx
; X86-NEXT: addl %edx, %edx
-; X86-NEXT: adcl %ecx, %ecx
-; X86-NEXT: andl $1, %ecx
-; X86-NEXT: negl %ecx
+; X86-NEXT: adcl %eax, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: shldl $31, %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shldl $31, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll $31, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %eax
+; X86-NEXT: negl %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %eax
; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %ecx
-; X86-NEXT: pushl %esi
; X86-NEXT: pushl %edx
; X86-NEXT: pushl %ebx
; X86-NEXT: calll __modti3
; X86-NEXT: addl $32, %esp
; X86-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 36(%ebp), %edx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl 36(%ebp), %esi
+; X86-NEXT: movl %esi, %edi
; X86-NEXT: sarl $31, %edi
; X86-NEXT: movl 20(%ebp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: sarl $31, %esi
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: sarl $31, %edx
; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: adcl %esi, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: negl %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: adcl %edx, %edx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: shldl $31, %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: shll $31, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: negl %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %edi
-; X86-NEXT: pushl %edx
-; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %edx
+; X86-NEXT: pushl %edx
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %eax
; X86-NEXT: movl 28(%ebp), %ebx
; X86-NEXT: movl %ebx, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: sarl $31, %esi
+; X86-NEXT: addl %eax, %eax
+; X86-NEXT: adcl %esi, %esi
; X86-NEXT: movl %esi, %ecx
-; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: addl %esi, %esi
-; X86-NEXT: adcl %ecx, %ecx
-; X86-NEXT: andl $1, %ecx
-; X86-NEXT: negl %ecx
+; X86-NEXT: shldl $31, %eax, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shldl $31, %esi, %eax
+; X86-NEXT: shll $31, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shll $31, %esi
+; X86-NEXT: andl $1, %esi
+; X86-NEXT: negl %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: pushl %edx
; X86-NEXT: pushl %edx
; X86-NEXT: pushl %edx
; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %ecx
+; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %esi
; X86-NEXT: pushl %ecx
; X86-NEXT: pushl %eax
-; X86-NEXT: pushl %esi
; X86-NEXT: pushl %edi
; X86-NEXT: calll __divti3
; X86-NEXT: addl $32, %esp
; X86-NEXT: sarl $31, %ebx
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: adcl %ebx, %ebx
-; X86-NEXT: andl $1, %ebx
-; X86-NEXT: negl %ebx
; X86-NEXT: movl %ebx, %edi
; X86-NEXT: shldl $31, %ecx, %edi
; X86-NEXT: shll $31, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $1, %ebx
+; X86-NEXT: negl %ebx
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: sbbl $0, %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl $0, %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl $0, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl $0, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: testl %edi, %edi
-; X86-NEXT: sets %cl
+; X86-NEXT: sets %al
; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
-; X86-NEXT: sets %ch
-; X86-NEXT: xorb %cl, %ch
+; X86-NEXT: sets %ah
+; X86-NEXT: xorb %al, %ah
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: orl %edi, %edx
-; X86-NEXT: setne %cl
-; X86-NEXT: testb %ch, %cl
-; X86-NEXT: cmovel %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: setne %al
+; X86-NEXT: testb %ah, %al
+; X86-NEXT: cmovel %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: subl $1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: sbbl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: cmpl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: sets %bh
; X86-NEXT: xorb %bl, %bh
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: orl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: orl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: setne %al
; X86-NEXT: testb %bh, %al
-; X86-NEXT: cmovel %edi, %ecx
+; X86-NEXT: cmovel %esi, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload