From 0959ffc97b738c489087bcf45578c1580a87e66d Mon Sep 17 00:00:00 2001
From: Ulrich Drepper
Date: Thu, 2 Sep 2010 23:36:25 -0700
Subject: [PATCH] Update x86-64 mpn routines from GMP 5.0.1.

---
 ChangeLog                 |  20 +++++--
 sysdeps/x86_64/add_n.S    |  99 ++++++++++++++++++++++++++++-------
 sysdeps/x86_64/addmul_1.S | 115 ++++++++++++++++++++++++++++++++---------
 sysdeps/x86_64/lshift.S   | 127 ++++++++++++++++++++++++++++++++-------------
 sysdeps/x86_64/mul_1.S    | 119 ++++++++++++++++++++++++++++++++++++------
 sysdeps/x86_64/rshift.S   | 129 ++++++++++++++++++++++++++++++++--------------
 sysdeps/x86_64/sub_n.S    |  28 ++--------
 sysdeps/x86_64/submul_1.S |  32 ++----------
 8 files changed, 482 insertions(+), 187 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index fd4b775..fe512db 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,12 +1,22 @@
+2010-09-02  Ulrich Drepper
+
+	* sysdeps/x86_64/add_n.S: Update from GMP 5.0.1.
+	* sysdeps/x86_64/addmul_1.S: Likewise.
+	* sysdeps/x86_64/lshift.S: Likewise.
+	* sysdeps/x86_64/mul_1.S: Likewise.
+	* sysdeps/x86_64/rshift.S: Likewise.
+	* sysdeps/x86_64/sub_n.S: Likewise.
+	* sysdeps/x86_64/submul_1.S: Likewise.
+
 2010-09-01  Samuel Thibault
 
-        This aligns bits/sched.h onto sysdeps/unix/sysv/linux/bits/sched.h:
-        Define __sched_param instead of SCHED_* and sched_param when
+	This aligns bits/sched.h onto sysdeps/unix/sysv/linux/bits/sched.h:
+	Define __sched_param instead of SCHED_* and sched_param when
 	<bits/sched.h> is included with __need_schedparam defined.
-        * bits/sched.h [__need_schedparam]
+	* bits/sched.h [__need_schedparam]
 	(SCHED_OTHER, SCHED_FIFO, SCHED_RR, sched_param): Do not define.
-        [!__defined_schedparam && (__need_schedparam || _SCHED_H)]
-        (__defined_schedparam): Define to 1.
+	[!__defined_schedparam && (__need_schedparam || _SCHED_H)]
+	(__defined_schedparam): Define to 1.
 	(__sched_param): New structure, identical to sched_param.
 	(__need_schedparam): Undefine.
diff --git a/sysdeps/x86_64/add_n.S b/sysdeps/x86_64/add_n.S
index 7883f6c..f0b4c3f 100644
--- a/sysdeps/x86_64/add_n.S
+++ b/sysdeps/x86_64/add_n.S
@@ -1,6 +1,6 @@
-/* Add two limb vectors of the same length > 0 and store sum in a third
-   limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
+   sum in a third limb vector.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,22 +21,81 @@
 #include "sysdep.h"
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define vp	%rdx
+#define n	%rcx
+#define cy	%r8
+
+#ifndef func
+# define func __mpn_add_n
+# define ADCSBB adc
+#endif
+
 	.text
-ENTRY (__mpn_add_n)
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		# clear cy
-	.p2align 2
-L(loop):
-	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	adcq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
-	jne	L(loop)
-	movq	%rcx, %rax		# zero %rax
-	adcq	%rax, %rax
+ENTRY (func)
+	xor	%r8, %r8
+	mov	(up), %r10
+	mov	(vp), %r11
+
+	lea	-8(up,n,8), up
+	lea	-8(vp,n,8), vp
+	lea	-16(rp,n,8), rp
+	mov	%ecx, %eax
+	neg	n
+	and	$3, %eax
+	je	L(b00)
+	add	%rax, n		/* clear low rcx bits for jrcxz */
+	cmp	$2, %eax
+	jl	L(b01)
+	je	L(b10)
+
+L(b11):	shr	%r8		/* set cy */
+	jmp	L(e11)
+
+L(b00):	shr	%r8		/* set cy */
+	mov	%r10, %r8
+	mov	%r11, %r9
+	lea	4(n), n
+	jmp	L(e00)
+
+L(b01):	shr	%r8		/* set cy */
+	jmp	L(e01)
+
+L(b10):	shr	%r8		/* set cy */
+	mov	%r10, %r8
+	mov	%r11, %r9
+	jmp	L(e10)
+
+L(end):	ADCSBB	%r11, %r10
+	mov	%r10, 8(rp)
+	mov	%ecx, %eax	/* clear eax, ecx contains 0 */
+	adc	%eax, %eax
 	ret
-END (__mpn_add_n)
+
+	.p2align 4
+L(top):
+	mov	-24(up,n,8), %r8
+	mov	-24(vp,n,8), %r9
+	ADCSBB	%r11, %r10
+	mov	%r10, -24(rp,n,8)
+L(e00):
+	mov	-16(up,n,8), %r10
+	mov	-16(vp,n,8), %r11
+	ADCSBB	%r9, %r8
+	mov	%r8, -16(rp,n,8)
+L(e11):
+	mov	-8(up,n,8), %r8
+	mov	-8(vp,n,8), %r9
+	ADCSBB	%r11, %r10
+	mov	%r10, -8(rp,n,8)
+L(e10):
+	mov	(up,n,8), %r10
+	mov	(vp,n,8), %r11
+	ADCSBB	%r9, %r8
+	mov	%r8, (rp,n,8)
+L(e01):
+	jrcxz	L(end)
+	lea	4(n), n
+	jmp	L(top)
+END (func)
diff --git a/sysdeps/x86_64/addmul_1.S b/sysdeps/x86_64/addmul_1.S
index bdb5226..e997896 100644
--- a/sysdeps/x86_64/addmul_1.S
+++ b/sysdeps/x86_64/addmul_1.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
+/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
    the result to a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,26 +21,95 @@
 #include "sysdep.h"
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n	%rdx
+#define v0	%rcx
+
+#ifndef func
+# define func __mpn_addmul_1
+# define ADDSUB add
+#endif
+
 	.text
-ENTRY (__mpn_addmul_1)
-	movq	%rdx, %r11
-	leaq	(%rsi,%rdx,8), %rsi
-	leaq	(%rdi,%rdx,8), %rdi
-	negq	%r11
-	xorl	%r8d, %r8d
-	xorl	%r10d, %r10d
-	.p2align 2
-L(loop):
-	movq	(%rsi,%r11,8), %rax
-	mulq	%rcx
-	addq	(%rdi,%r11,8), %rax
-	adcq	%r10, %rdx
-	addq	%r8, %rax
-	movq	%r10, %r8
-	movq	%rax, (%rdi,%r11,8)
-	adcq	%rdx, %r8
-	incq	%r11
-	jne	L(loop)
-	movq	%r8, %rax
+ENTRY (func)
+	push	%rbx
+	push	%rbp
+	lea	(%rdx), %rbx
+	neg	%rbx
+
+	mov	(up), %rax
+	mov	(rp), %r10
+
+	lea	-16(rp,%rdx,8), rp
+	lea	(up,%rdx,8), up
+	mul	%rcx
+
+	bt	$0, %ebx
+	jc	L(odd)
+
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	lea	(%rdx), %rbp
+	mul	%rcx
+	add	$2, %rbx
+	jns	L(n2)
+
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	lea	(%rdx), %r9
+	jmp	L(mid)
+
+L(odd):	add	$1, %rbx
+	jns	L(n1)
+
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	lea	(%rdx), %r9
+	mul	%rcx
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	lea	(%rdx), %rbp
+	jmp	L(e)
+
+	.p2align 4
+L(top):	mul	%rcx
+	ADDSUB	%r8, %r10
+	lea	(%rax), %r8
+	mov	(up,%rbx,8), %rax
+	adc	%r9, %r11
+	mov	%r10, -8(rp,%rbx,8)
+	mov	(rp,%rbx,8), %r10
+	lea	(%rdx), %r9
+	adc	$0, %rbp
+L(mid):	mul	%rcx
+	ADDSUB	%r11, %r10
+	lea	(%rax), %r11
+	mov	8(up,%rbx,8), %rax
+	adc	%rbp, %r8
+	mov	%r10, (rp,%rbx,8)
+	mov	8(rp,%rbx,8), %r10
+	lea	(%rdx), %rbp
+	adc	$0, %r9
+L(e):	add	$2, %rbx
+	js	L(top)
+
+	mul	%rcx
+	ADDSUB	%r8, %r10
+	adc	%r9, %r11
+	mov	%r10, -8(rp)
+	adc	$0, %rbp
+L(n2):	mov	(rp), %r10
+	ADDSUB	%r11, %r10
+	adc	%rbp, %rax
+	mov	%r10, (rp)
+	adc	$0, %rdx
+L(n1):	mov	8(rp), %r10
+	ADDSUB	%rax, %r10
+	mov	%r10, 8(rp)
+	mov	%ebx, %eax	/* zero rax */
+	adc	%rdx, %rax
+	pop	%rbp
+	pop	%rbx
 	ret
-END (__mpn_addmul_1)
+END (func)
diff --git a/sysdeps/x86_64/lshift.S b/sysdeps/x86_64/lshift.S
index 5ac66f0..f89d3e0 100644
--- a/sysdeps/x86_64/lshift.S
+++ b/sysdeps/x86_64/lshift.S
@@ -1,5 +1,5 @@
-/* AMD64 __mpn_lshift --
-   Copyright 2004, 2006 Free Software Foundation, Inc.
+/* x86-64 __mpn_lshift --
+   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -20,41 +20,98 @@
 #include "sysdep.h"
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n	%rdx
+#define cnt	%cl
 
 	.text
 ENTRY (__mpn_lshift)
-	movq	-8(%rsi,%rdx,8), %mm7
-	movd	%ecx, %mm1
-	movl	$64, %eax
-	subl	%ecx, %eax
-	movd	%eax, %mm0
-	movq	%mm7, %mm3
-	psrlq	%mm0, %mm7
-	movd	%mm7, %rax
-	subq	$2, %rdx
-	jl	L(endo)
-	.p2align 2
-L(loop):
-	movq	(%rsi,%rdx,8), %mm6
-	movq	%mm6, %mm2
-	psrlq	%mm0, %mm6
-	psllq	%mm1, %mm3
-	por	%mm6, %mm3
-	movq	%mm3, 8(%rdi,%rdx,8)
-	je	L(ende)
-	movq	-8(%rsi,%rdx,8), %mm7
-	movq	%mm7, %mm3
-	psrlq	%mm0, %mm7
-	psllq	%mm1, %mm2
-	por	%mm7, %mm2
-	movq	%mm2, (%rdi,%rdx,8)
-	subq	$2, %rdx
-	jge	L(loop)
-L(endo):
-	movq	%mm3, %mm2
-L(ende):
-	psllq	%mm1, %mm2
-	movq	%mm2, (%rdi)
-	emms
+	lea	-8(rp,n,8), rp
+	lea	-8(up,n,8), up
+
+	mov	%edx, %eax
+	and	$3, %eax
+	jne	L(nb00)
+L(b00):	/* n = 4, 8, 12, ... */
+	mov	(up), %r10
+	mov	-8(up), %r11
+	xor	%eax, %eax
+	shld	%cl, %r10, %rax
+	mov	-16(up), %r8
+	lea	24(rp), rp
+	sub	$4, n
+	jmp	L(00)
+
+L(nb00):/* n = 1, 5, 9, ... */
+	cmp	$2, %eax
+	jae	L(nb01)
+L(b01):	mov	(up), %r9
+	xor	%eax, %eax
+	shld	%cl, %r9, %rax
+	sub	$2, n
+	jb	L(le1)
+	mov	-8(up), %r10
+	mov	-16(up), %r11
+	lea	-8(up), up
+	lea	16(rp), rp
+	jmp	L(01)
+L(le1):	shl	%cl, %r9
+	mov	%r9, (rp)
+	ret
+
+L(nb01):/* n = 2, 6, 10, ... */
+	jne	L(b11)
+L(b10):	mov	(up), %r8
+	mov	-8(up), %r9
+	xor	%eax, %eax
+	shld	%cl, %r8, %rax
+	sub	$3, n
+	jb	L(le2)
+	mov	-16(up), %r10
+	lea	-16(up), up
+	lea	8(rp), rp
+	jmp	L(10)
+L(le2):	shld	%cl, %r9, %r8
+	mov	%r8, (rp)
+	shl	%cl, %r9
+	mov	%r9, -8(rp)
+	ret
+
+	.p2align 4			/* performance critical! */
+L(b11):	/* n = 3, 7, 11, ... */
+	mov	(up), %r11
+	mov	-8(up), %r8
+	xor	%eax, %eax
+	shld	%cl, %r11, %rax
+	mov	-16(up), %r9
+	lea	-24(up), up
+	sub	$4, n
+	jb	L(end)
+
+	.p2align 4
+L(top):	shld	%cl, %r8, %r11
+	mov	(up), %r10
+	mov	%r11, (rp)
+L(10):	shld	%cl, %r9, %r8
+	mov	-8(up), %r11
+	mov	%r8, -8(rp)
+L(01):	shld	%cl, %r10, %r9
+	mov	-16(up), %r8
+	mov	%r9, -16(rp)
+L(00):	shld	%cl, %r11, %r10
+	mov	-24(up), %r9
+	mov	%r10, -24(rp)
+	add	$-32, up
+	lea	-32(rp), rp
+	sub	$4, n
+	jnc	L(top)
+
+L(end):	shld	%cl, %r8, %r11
+	mov	%r11, (rp)
+	shld	%cl, %r9, %r8
+	mov	%r8, -8(rp)
+	shl	%cl, %r9
+	mov	%r9, -16(rp)
 	ret
 END (__mpn_lshift)
diff --git a/sysdeps/x86_64/mul_1.S b/sysdeps/x86_64/mul_1.S
index 978916b..676afd1 100644
--- a/sysdeps/x86_64/mul_1.S
+++ b/sysdeps/x86_64/mul_1.S
@@ -1,6 +1,6 @@
 /* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
    the result in a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -21,22 +21,109 @@
 #include <sysdep.h>
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n_param	%rdx
+#define vl	%rcx
+
+#define n	%r11
+
 	.text
 ENTRY (__mpn_mul_1)
-	movq	%rdx, %r11
-	leaq	(%rsi,%rdx,8), %rsi
-	leaq	(%rdi,%rdx,8), %rdi
-	negq	%r11
-	xorl	%r8d, %r8d
-L(loop):
-	movq	(%rsi,%r11,8), %rax
-	mulq	%rcx
-	addq	%r8, %rax
-	movl	$0, %r8d
-	adcq	%rdx, %r8
-	movq	%rax, (%rdi,%r11,8)
-	incq	%r11
-	jne	L(loop)
-	movq	%r8, %rax
+	push	%rbx
+	cfi_adjust_cfa_offset (8)
+	cfi_rel_offset (%rbx, 0)
+	xor	%r10, %r10
+	mov	(up), %rax		/* read first u limb early */
+	mov	n_param, %rbx		/* move away n from rdx, mul uses it */
+	mul	vl
+	mov	%rbx, %r11
+
+	add	%r10, %rax
+	adc	$0, %rdx
+
+	and	$3, %ebx
+	jz	L(b0)
+	cmp	$2, %ebx
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	dec	n
+	jne	L(gt1)
+	mov	%rax, (rp)
+	jmp	L(ret)
+L(gt1):	lea	8(up,n,8), up
+	lea	-8(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	xor	%ebx, %ebx
+	mov	%rax, %r9
+	mov	(up,n,8), %rax
+	mov	%rdx, %r8
+	jmp	L(L1)
+
+L(b0):	lea	(up,n,8), up
+	lea	-16(rp,n,8), rp
+	neg	n
+	xor	%r10, %r10
+	mov	%rax, %r8
+	mov	%rdx, %rbx
+	jmp	L(L0)
+
+L(b3):	lea	-8(up,n,8), up
+	lea	-24(rp,n,8), rp
+	neg	n
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(L3)
+
+L(b2):	lea	-16(up,n,8), up
+	lea	-32(rp,n,8), rp
+	neg	n
+	xor	%r8, %r8
+	xor	%ebx, %ebx
+	mov	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%rdx, %r9
+	jmp	L(L2)
+
+	.p2align 4
+L(top):	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	mov	(up,n,8), %rax
+	adc	%rdx, %r8
+	mov	$0, %r10d
+L(L1):	mul	vl
+	mov	%r9, 8(rp,n,8)
+	add	%rax, %r8
+	adc	%rdx, %rbx
+L(L0):	mov	8(up,n,8), %rax
+	mul	vl
+	mov	%r8, 16(rp,n,8)
+	add	%rax, %rbx
+	adc	%rdx, %r10
+L(L3):	mov	16(up,n,8), %rax
+	mul	vl
+	mov	%rbx, 24(rp,n,8)
+	mov	$0, %r8d	# zero
+	mov	%r8, %rbx	# zero
+	add	%rax, %r10
+	mov	24(up,n,8), %rax
+	mov	%r8, %r9	# zero
+	adc	%rdx, %r9
+L(L2):	mul	vl
+	add	$4, n
+	js	L(top)
+
+	mov	%r10, (rp,n,8)
+	add	%rax, %r9
+	adc	%r8, %rdx
+	mov	%r9, 8(rp,n,8)
+	add	%r8, %rdx
+L(ret):	mov	%rdx, %rax
+
+	pop	%rbx
+	cfi_adjust_cfa_offset (-8)
+	cfi_restore (%rbx)
 	ret
 END (__mpn_mul_1)
diff --git a/sysdeps/x86_64/rshift.S b/sysdeps/x86_64/rshift.S
index ee0c8aa..8ff0551 100644
--- a/sysdeps/x86_64/rshift.S
+++ b/sysdeps/x86_64/rshift.S
@@ -1,5 +1,5 @@
-/* AMD64 __mpn_rshift --
-   Copyright (C) 2004, 2006 Free Software Foundation, Inc.
+/* x86-64 __mpn_rshift --
+   Copyright (C) 2007, 2009 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -20,43 +20,96 @@
 #include "sysdep.h"
 #include "asm-syntax.h"
 
+#define rp	%rdi
+#define up	%rsi
+#define n	%rdx
+#define cnt	%cl
+
 	.text
 ENTRY (__mpn_rshift)
-	movq	(%rsi), %mm7
-	movd	%ecx, %mm1
-	movl	$64, %eax
-	subl	%ecx, %eax
-	movd	%eax, %mm0
-	movq	%mm7, %mm3
-	psllq	%mm0, %mm7
-	movd	%mm7, %rax
-	leaq	(%rsi,%rdx,8), %rsi
-	leaq	(%rdi,%rdx,8), %rdi
-	negq	%rdx
-	addq	$2, %rdx
-	jg	L(endo)
-	.p2align 2
-L(loop):
-	movq	-8(%rsi,%rdx,8), %mm6
-	movq	%mm6, %mm2
-	psllq	%mm0, %mm6
-	psrlq	%mm1, %mm3
-	por	%mm6, %mm3
-	movq	%mm3, -16(%rdi,%rdx,8)
-	je	L(ende)
-	movq	(%rsi,%rdx,8), %mm7
-	movq	%mm7, %mm3
-	psllq	%mm0, %mm7
-	psrlq	%mm1, %mm2
-	por	%mm7, %mm2
-	movq	%mm2, -8(%rdi,%rdx,8)
-	addq	$2, %rdx
-	jle	L(loop)
-L(endo):
-	movq	%mm3, %mm2
-L(ende):
-	psrlq	%mm1, %mm2
-	movq	%mm2, -8(%rdi)
-	emms
+	mov	%edx, %eax
+	and	$3, %eax
+	jne	L(nb00)
+L(b00):	/* n = 4, 8, 12, ... */
+	mov	(up), %r10
+	mov	8(up), %r11
+	xor	%eax, %eax
+	shrd	%cl, %r10, %rax
+	mov	16(up), %r8
+	lea	8(up), up
+	lea	-24(rp), rp
+	sub	$4, n
+	jmp	L(00)
+
+L(nb00):/* n = 1, 5, 9, ... */
+	cmp	$2, %eax
+	jae	L(nb01)
+L(b01):	mov	(up), %r9
+	xor	%eax, %eax
+	shrd	%cl, %r9, %rax
+	sub	$2, n
+	jb	L(le1)
+	mov	8(up), %r10
+	mov	16(up), %r11
+	lea	16(up), up
+	lea	-16(rp), rp
+	jmp	L(01)
+L(le1):	shr	%cl, %r9
+	mov	%r9, (rp)
+	ret
+
+L(nb01):/* n = 2, 6, 10, ... */
+	jne	L(b11)
+L(b10):	mov	(up), %r8
+	mov	8(up), %r9
+	xor	%eax, %eax
+	shrd	%cl, %r8, %rax
+	sub	$3, n
+	jb	L(le2)
+	mov	16(up), %r10
+	lea	24(up), up
+	lea	-8(rp), rp
+	jmp	L(10)
+L(le2):	shrd	%cl, %r9, %r8
+	mov	%r8, (rp)
+	shr	%cl, %r9
+	mov	%r9, 8(rp)
+	ret
+
+	.p2align 4
+L(b11):	/* n = 3, 7, 11, ... */
+	mov	(up), %r11
+	mov	8(up), %r8
+	xor	%eax, %eax
+	shrd	%cl, %r11, %rax
+	mov	16(up), %r9
+	lea	32(up), up
+	sub	$4, n
+	jb	L(end)
+
+	.p2align 4
+L(top):	shrd	%cl, %r8, %r11
+	mov	-8(up), %r10
+	mov	%r11, (rp)
+L(10):	shrd	%cl, %r9, %r8
+	mov	(up), %r11
+	mov	%r8, 8(rp)
+L(01):	shrd	%cl, %r10, %r9
+	mov	8(up), %r8
+	mov	%r9, 16(rp)
+L(00):	shrd	%cl, %r11, %r10
+	mov	16(up), %r9
+	mov	%r10, 24(rp)
+	add	$32, up
+	lea	32(rp), rp
+	sub	$4, n
+	jnc	L(top)
+
+L(end):	shrd	%cl, %r8, %r11
+	mov	%r11, (rp)
+	shrd	%cl, %r9, %r8
+	mov	%r8, 8(rp)
+	shr	%cl, %r9
+	mov	%r9, 16(rp)
 	ret
 END (__mpn_rshift)
diff --git a/sysdeps/x86_64/sub_n.S b/sysdeps/x86_64/sub_n.S
index 48e1a2e..60c15fc 100644
--- a/sysdeps/x86_64/sub_n.S
+++ b/sysdeps/x86_64/sub_n.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
+/* x86-64 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store
    sum in a third limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2006, 2007 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -18,25 +18,7 @@
    the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
    MA 02111-1307, USA. */
 
-#include "sysdep.h"
-#include "asm-syntax.h"
+#define func __mpn_sub_n
+#define ADCSBB sbb
 
-	.text
-ENTRY (__mpn_sub_n)
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		# clear cy
-	.p2align 2
-L(loop):
-	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	sbbq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
-	jne	L(loop)
-	movq	%rcx, %rax		# zero %rax
-	adcq	%rax, %rax
-	ret
-END (__mpn_sub_n)
+#include "add_n.S"
diff --git a/sysdeps/x86_64/submul_1.S b/sysdeps/x86_64/submul_1.S
index e94c9a7..150a927 100644
--- a/sysdeps/x86_64/submul_1.S
+++ b/sysdeps/x86_64/submul_1.S
@@ -1,6 +1,6 @@
-/* AMD64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
+/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
    the result from a second limb vector.
-   Copyright (C) 2004 Free Software Foundation, Inc.
+   Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
    This file is part of the GNU MP Library.
 
    The GNU MP Library is free software; you can redistribute it and/or modify
@@ -18,29 +18,7 @@
    the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
    MA 02111-1307, USA. */
 
-#include "sysdep.h"
-#include "asm-syntax.h"
+#define func __mpn_submul_1
+#define ADDSUB sub
 
-	.text
-ENTRY (__mpn_submul_1)
-	movq	%rdx, %r11
-	leaq	(%rsi,%r11,8), %rsi
-	leaq	(%rdi,%r11,8), %rdi
-	negq	%r11
-	xorl	%r8d, %r8d
-	.p2align 3
-L(loop):
-	movq	(%rsi,%r11,8), %rax
-	movq	(%rdi,%r11,8), %r10
-	mulq	%rcx
-	subq	%r8, %r10
-	movl	$0, %r8d
-	adcl	%r8d, %r8d
-	subq	%rax, %r10
-	adcq	%rdx, %r8
-	movq	%r10, (%rdi,%r11,8)
-	incq	%r11
-	jne	L(loop)
-	movq	%r8, %rax
-	ret
-END (__mpn_submul_1)
+#include "addmul_1.S"
-- 
2.7.4
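
Note on the add_n/sub_n rewrite: the new loop handles four limbs per
iteration and keeps the inter-limb carry in the CPU carry flag for the
whole loop.  That is why the index is advanced with lea and the exit is
tested with jrcxz; neither instruction touches the flags, while an
ordinary add or dec would destroy the carry (the entry code rounds %rcx
to a multiple of four, per the "clear low rcx bits for jrcxz" comment,
and the L(b*) cases enter the unrolled body at the point matching
n mod 4).  Functionally, both routines compute the following portable
C model.  This is an illustrative sketch only; mp_limb_t and the
function name are local to the sketch, not glibc interfaces.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;

    /* rp[] = up[] + vp[] over n limbs; returns the carry out (0 or 1).
       __mpn_sub_n is the same body with ADCSBB defined as sbb, i.e.
       subtraction with borrow instead of addition with carry.  */
    static mp_limb_t
    mpn_add_n_model (mp_limb_t *rp, const mp_limb_t *up,
                     const mp_limb_t *vp, size_t n)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          mp_limb_t u = up[i];
          mp_limb_t s = u + vp[i];        /* may wrap modulo 2^64 */
          mp_limb_t c = s < u;            /* carry out of u + v */
          rp[i] = s + cy;                 /* add the incoming carry */
          cy = c | (rp[i] < s);           /* carry out of this limb */
        }
      return cy;
    }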
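
Note on mul_1/addmul_1/submul_1: the one-operand mul instruction leaves
a 128-bit product in %rdx:%rax, and the rewritten loops software-pipeline
the multiplies while a carry limb flows between iterations; defining
ADDSUB as sub turns the shared addmul_1 body into submul_1.  In C terms,
continuing the sketch above (unsigned __int128 is a GCC/Clang extension
standing in for what the assembly gets from mul):

    /* rp[] = up[] * v over n limbs; returns the high limb.  */
    static mp_limb_t
    mpn_mul_1_model (mp_limb_t *rp, const mp_limb_t *up, size_t n,
                     mp_limb_t v)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
          rp[i] = (mp_limb_t) p;          /* low 64 bits */
          cy = (mp_limb_t) (p >> 64);     /* high 64 bits carry on */
        }
      return cy;
    }

    /* rp[] += up[] * v over n limbs; returns the final carry limb.
       __mpn_submul_1 subtracts the product instead and returns the
       resulting borrow.  */
    static mp_limb_t
    mpn_addmul_1_model (mp_limb_t *rp, const mp_limb_t *up, size_t n,
                        mp_limb_t v)
    {
      mp_limb_t cy = 0;
      for (size_t i = 0; i < n; i++)
        {
          unsigned __int128 p = (unsigned __int128) up[i] * v + rp[i] + cy;
          rp[i] = (mp_limb_t) p;
          cy = (mp_limb_t) (p >> 64);
        }
      return cy;
    }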
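
Note on lshift/rshift: the old code shifted via MMX psllq/psrlq pairs
(hence the trailing emms); the new code uses the shld/shrd funnel-shift
instructions, which combine bits of two adjacent limbs in a single step,
in a four-way unrolled loop entered according to n mod 4.  lshift walks
from the most significant limb down, rshift from the least significant
limb up, and both return the bits shifted out of the end limb.  A C model
of the left shift, with the same caveats as the sketches above (valid for
shift counts 1 through 63, as in the assembly):

    /* rp[] = up[] << cnt over n limbs; returns the bits shifted out of
       the top limb.  Walks from the top limb down, like the shld loop.  */
    static mp_limb_t
    mpn_lshift_model (mp_limb_t *rp, const mp_limb_t *up, size_t n,
                      unsigned int cnt)
    {
      mp_limb_t ret = up[n - 1] >> (64 - cnt);
      for (size_t i = n - 1; i > 0; i--)
        rp[i] = (up[i] << cnt) | (up[i - 1] >> (64 - cnt));
      rp[0] = up[0] << cnt;
      return ret;
    }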