From 8f5ca04bc7fd53741d80117df992995ace8f6d2d Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 16 Oct 1995 01:37:51 +0000 Subject: [PATCH] Sat Oct 14 02:52:36 1995 Ulrich Drepper * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include . * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. Sat Oct 14 02:52:36 1995 Ulrich Drepper * malloc/malloc.c (_malloc_internal): Performance fix. Move if statement out of loop. * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster implementation using GMP functions. Contributed by Torbjorn Granlund and Ulrich Drepper. * stdio/test_rdwr.c: Include . * sysdeps/i386/i586/Implies: New file. New highly optimized string functions for i[345]86. * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. * sysdeps/i386/i586/strlen.S: New file. * sysdeps/i386/memchr.c: Removed. There is now an assembler version. * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did not correspond to used values. * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper around a kernel header file. * sysdeps/unix/sysv/linux/Dist: Add it. * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): Likewise. * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of defining ourself we use a kernel header file. * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system call handler for i586. * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. --- .cvsignore | 2 +- ChangeLog | 40 ++++ configure.in | 28 +-- hurd/Makefile | 4 +- hurd/hurd.h | 69 ++---- hurd/hurdinit.c | 6 + stdio/_itoa.c | 401 +++++++++++++++++++++++++++++++++- stdio/_itoa.h | 26 --- stdio/test_rdwr.c | 1 + stdlib/gmp-impl.h | 66 ++++-- stdlib/gmp.h | 40 ++-- stdlib/longlong.h | 229 +++++++++++-------- sysdeps/alpha/add_n.s | 119 ++++++++++ sysdeps/alpha/addmul_1.s | 100 +++++++++ sysdeps/alpha/alphaev5/add_n.s | 118 ++++++++++ sysdeps/alpha/alphaev5/lshift.s | 175 +++++++++++++++ sysdeps/alpha/alphaev5/rshift.s | 173 +++++++++++++++ sysdeps/alpha/lshift.s | 108 +++++++++ sysdeps/alpha/mul_1.s | 84 +++++++ sysdeps/alpha/rshift.s | 106 +++++++++ sysdeps/alpha/sub_n.s | 119 ++++++++++ sysdeps/alpha/submul_1.s | 100 +++++++++ sysdeps/alpha/udiv_qrnnd.S | 2 +- sysdeps/generic/divmod_1.c | 6 +- sysdeps/generic/mod_1.c | 2 - sysdeps/hppa/add_n.s | 57 +++++ sysdeps/hppa/hppa1.1/addmul_1.s | 101 +++++++++ sysdeps/hppa/hppa1.1/mul_1.s | 97 ++++++++ sysdeps/hppa/hppa1.1/submul_1.s | 110 ++++++++++ sysdeps/hppa/hppa1.1/udiv_qrnnd.s | 74 +++++++ sysdeps/hppa/lshift.s | 65 ++++++ sysdeps/hppa/rshift.s | 62 ++++++ sysdeps/hppa/sub_n.s | 58 +++++ sysdeps/hppa/udiv_qrnnd.s | 285 ++++++++++++++++++++++++ sysdeps/i386/add_n.S | 18 +- sysdeps/i386/gmp-mparam.h | 28 +++ sysdeps/i386/i486/strcat.S | 260 ++++++++++++++++++++++ sysdeps/i386/i486/strlen.S | 132 +++++++++++ sysdeps/i386/i586/Implies | 2 + sysdeps/i386/i586/add_n.S | 136 ++++++++++++ sysdeps/i386/i586/addmul_1.S | 84 +++++++ sysdeps/i386/i586/lshift.S | 213 ++++++++++++++++++ sysdeps/i386/i586/memcopy.h | 6 +- sysdeps/i386/i586/mul_1.S | 78 +++++++ sysdeps/i386/i586/rshift.S | 213 ++++++++++++++++++ sysdeps/i386/i586/strchr.S | 334 ++++++++++++++++++++++++++++ sysdeps/i386/i586/strlen.S | 185 ++++++++++++++++ sysdeps/i386/i586/sub_n.S | 136 ++++++++++++ sysdeps/i386/i586/submul_1.S | 82 +++++++ sysdeps/i386/memchr.S | 315 ++++++++++++++++++++++++++ sysdeps/i386/memchr.c | 48 ---- sysdeps/i386/memcmp.S | 68 ++++++ sysdeps/i386/stpcpy.S | 87 ++++++++ sysdeps/i386/stpncpy.S | 143 ++++++++++++ sysdeps/i386/strchr.S | 278 +++++++++++++++++++++++ sysdeps/i386/strcspn.S | 176 +++++++++++++++ sysdeps/i386/strpbrk.S | 177 +++++++++++++++ sysdeps/i386/strrchr.S | 321 +++++++++++++++++++++++++++ sysdeps/i386/strspn.S | 176 +++++++++++++++ sysdeps/i386/sub_n.S | 26 ++- sysdeps/i960/add_n.s | 21 ++ sysdeps/i960/addmul_1.s | 26 +++ sysdeps/i960/mul_1.s | 23 ++ sysdeps/i960/sub_n.s | 21 ++ sysdeps/m88k/m88100/add_n.s | 103 +++++++++ sysdeps/m88k/m88100/mul_1.s | 128 +++++++++++ sysdeps/m88k/m88100/sub_n.s | 104 +++++++++ sysdeps/m88k/m88110/mul_1.s | 84 +++++++ sysdeps/mips/add_n.s | 119 ++++++++++ sysdeps/mips/addmul_1.s | 96 ++++++++ sysdeps/mips/lshift.s | 94 ++++++++ sysdeps/mips/mips3/add_n.s | 119 ++++++++++ sysdeps/mips/mips3/addmul_1.s | 96 ++++++++ sysdeps/mips/mips3/gmp-mparam.h | 26 +++ sysdeps/mips/mips3/lshift.s | 94 ++++++++ sysdeps/mips/mips3/mul_1.s | 84 +++++++ sysdeps/mips/mips3/rshift.s | 91 ++++++++ sysdeps/mips/mips3/sub_n.s | 119 ++++++++++ sysdeps/mips/mips3/submul_1.s | 96 ++++++++ sysdeps/mips/mul_1.s | 84 +++++++ sysdeps/mips/rshift.s | 91 ++++++++ sysdeps/mips/sub_n.s | 119 ++++++++++ sysdeps/mips/submul_1.s | 96 ++++++++ sysdeps/rs6000/add_n.s | 54 +++++ sysdeps/rs6000/addmul_1.s | 122 +++++++++++ sysdeps/rs6000/lshift.s | 58 +++++ sysdeps/rs6000/mul_1.s | 109 +++++++++ sysdeps/rs6000/rshift.s | 56 +++++ sysdeps/rs6000/sub_n.s | 55 +++++ sysdeps/rs6000/submul_1.s | 127 +++++++++++ sysdeps/sparc/add_n.S | 15 +- sysdeps/sparc/sparc8/addmul_1.S | 7 + sysdeps/sparc/sparc8/mul_1.S | 7 + sysdeps/sparc/sub_n.S | 15 +- sysdeps/unix/sysv/linux/Dist | 1 + sysdeps/unix/sysv/linux/Makefile | 6 +- sysdeps/unix/sysv/linux/i386/sysdep.h | 76 ++++--- sysdeps/unix/sysv/linux/local_lim.h | 17 +- sysdeps/unix/sysv/linux/nfs/nfs.h | 1 + sysdeps/unix/sysv/linux/sys/param.h | 31 ++- sysdeps/vax/add_n.s | 47 ++++ sysdeps/vax/addmul_1.s | 125 +++++++++++ sysdeps/vax/gmp-mparam.h | 28 +++ sysdeps/vax/mul_1.s | 122 +++++++++++ sysdeps/vax/sub_n.s | 47 ++++ sysdeps/vax/submul_1.s | 125 +++++++++++ sysdeps/z8000/add_n.s | 52 +++++ sysdeps/z8000/mul_1.s | 67 ++++++ sysdeps/z8000/sub_n.s | 53 +++++ 109 files changed, 9753 insertions(+), 359 deletions(-) create mode 100644 sysdeps/alpha/add_n.s create mode 100644 sysdeps/alpha/addmul_1.s create mode 100644 sysdeps/alpha/alphaev5/add_n.s create mode 100644 sysdeps/alpha/alphaev5/lshift.s create mode 100644 sysdeps/alpha/alphaev5/rshift.s create mode 100644 sysdeps/alpha/lshift.s create mode 100644 sysdeps/alpha/mul_1.s create mode 100644 sysdeps/alpha/rshift.s create mode 100644 sysdeps/alpha/sub_n.s create mode 100644 sysdeps/alpha/submul_1.s create mode 100644 sysdeps/hppa/add_n.s create mode 100644 sysdeps/hppa/hppa1.1/addmul_1.s create mode 100644 sysdeps/hppa/hppa1.1/mul_1.s create mode 100644 sysdeps/hppa/hppa1.1/submul_1.s create mode 100644 sysdeps/hppa/hppa1.1/udiv_qrnnd.s create mode 100644 sysdeps/hppa/lshift.s create mode 100644 sysdeps/hppa/rshift.s create mode 100644 sysdeps/hppa/sub_n.s create mode 100644 sysdeps/hppa/udiv_qrnnd.s create mode 100644 sysdeps/i386/gmp-mparam.h create mode 100644 sysdeps/i386/i486/strcat.S create mode 100644 sysdeps/i386/i486/strlen.S create mode 100644 sysdeps/i386/i586/Implies create mode 100644 sysdeps/i386/i586/add_n.S create mode 100644 sysdeps/i386/i586/addmul_1.S create mode 100644 sysdeps/i386/i586/lshift.S create mode 100644 sysdeps/i386/i586/mul_1.S create mode 100644 sysdeps/i386/i586/rshift.S create mode 100644 sysdeps/i386/i586/strchr.S create mode 100644 sysdeps/i386/i586/strlen.S create mode 100644 sysdeps/i386/i586/sub_n.S create mode 100644 sysdeps/i386/i586/submul_1.S create mode 100644 sysdeps/i386/memchr.S delete mode 100644 sysdeps/i386/memchr.c create mode 100644 sysdeps/i386/memcmp.S create mode 100644 sysdeps/i386/stpcpy.S create mode 100644 sysdeps/i386/stpncpy.S create mode 100644 sysdeps/i386/strchr.S create mode 100644 sysdeps/i386/strcspn.S create mode 100644 sysdeps/i386/strpbrk.S create mode 100644 sysdeps/i386/strrchr.S create mode 100644 sysdeps/i386/strspn.S create mode 100644 sysdeps/i960/add_n.s create mode 100644 sysdeps/i960/addmul_1.s create mode 100644 sysdeps/i960/mul_1.s create mode 100644 sysdeps/i960/sub_n.s create mode 100644 sysdeps/m88k/m88100/add_n.s create mode 100644 sysdeps/m88k/m88100/mul_1.s create mode 100644 sysdeps/m88k/m88100/sub_n.s create mode 100644 sysdeps/m88k/m88110/mul_1.s create mode 100644 sysdeps/mips/add_n.s create mode 100644 sysdeps/mips/addmul_1.s create mode 100644 sysdeps/mips/lshift.s create mode 100644 sysdeps/mips/mips3/add_n.s create mode 100644 sysdeps/mips/mips3/addmul_1.s create mode 100644 sysdeps/mips/mips3/gmp-mparam.h create mode 100644 sysdeps/mips/mips3/lshift.s create mode 100644 sysdeps/mips/mips3/mul_1.s create mode 100644 sysdeps/mips/mips3/rshift.s create mode 100644 sysdeps/mips/mips3/sub_n.s create mode 100644 sysdeps/mips/mips3/submul_1.s create mode 100644 sysdeps/mips/mul_1.s create mode 100644 sysdeps/mips/rshift.s create mode 100644 sysdeps/mips/sub_n.s create mode 100644 sysdeps/mips/submul_1.s create mode 100644 sysdeps/rs6000/add_n.s create mode 100644 sysdeps/rs6000/addmul_1.s create mode 100644 sysdeps/rs6000/lshift.s create mode 100644 sysdeps/rs6000/mul_1.s create mode 100644 sysdeps/rs6000/rshift.s create mode 100644 sysdeps/rs6000/sub_n.s create mode 100644 sysdeps/rs6000/submul_1.s create mode 100644 sysdeps/unix/sysv/linux/nfs/nfs.h create mode 100644 sysdeps/vax/add_n.s create mode 100644 sysdeps/vax/addmul_1.s create mode 100644 sysdeps/vax/gmp-mparam.h create mode 100644 sysdeps/vax/mul_1.s create mode 100644 sysdeps/vax/sub_n.s create mode 100644 sysdeps/vax/submul_1.s create mode 100644 sysdeps/z8000/add_n.s create mode 100644 sysdeps/z8000/mul_1.s create mode 100644 sysdeps/z8000/sub_n.s diff --git a/.cvsignore b/.cvsignore index cf479d2..ff3f67f 100644 --- a/.cvsignore +++ b/.cvsignore @@ -5,7 +5,7 @@ glibc-* configparms -sun4 i386 i386-gnuelf hp300-netbsd hp300 i486-linux +sun[43]* i[345]86* hp300* ieeetest hppa-sysdeps regex diff --git a/ChangeLog b/ChangeLog index 618dd3e..d8a781c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,43 @@ +Sat Oct 14 02:52:36 1995 Ulrich Drepper + + * malloc/malloc.c (_malloc_internal): Performance fix. Move + if statement out of loop. + + * stdio/_itoa.c, stdio/_itoa.h: Complete rewrite. Much faster + implementation using GMP functions. Contributed by + Torbjorn Granlund and Ulrich Drepper. + + * stdio/test_rdwr.c: Include . + + * sysdeps/i386/i586/Implies: New file. + + New highly optimized string functions for i[345]86. + * sysdeps/i386/memchr.S, sysdeps/i386/memcmp.S: New files. + * sysdeps/i386/stpcpy.S, sysdeps/i386/stpncpy.S: New files. + * sysdeps/i386/strchr.S, sysdeps/i386/strcspn.S: New files. + * sysdeps/i386/strpbrk.S, sysdeps/i386/strrchr.S: New files. + * sysdeps/i386/strspn.S, sysdeps/i386/i486/strcat.S: New files. + * sysdeps/i386/i486/strlen.S, sysdeps/i386/i586/strchr.S: New files. + * sysdeps/i386/i586/strlen.S: New file. + * sysdeps/i386/memchr.c: Removed. There is now an assembler version. + + * sysdeps/i386/i586/memcopy.h (WORD_COPY_BWD): Parameters did + not correspond to used values. + + * sysdeps/unix/sysv/linux/nfs/nfs.h: New file. Simply a wrapper + around a kernel header file. + * sysdeps/unix/sysv/linux/Dist: Add it. + * sysdeps/unix/sysv/linux/Makefile [$(subdir)=sunrpc] (headers): + Likewise. + + * sysdeps/unix/sysv/linux/local_lim.h: Rewrite. Instead of + defining ourself we use a kernel header file. + + * sysdeps/unix/sysv/linux/i386/sysdep.h (DO_CALL): Optimize system + call handler for i586. + + * sysdeps/unix/sysv/linux/sys/param.h: Add copyright and clean up. + Wed Oct 11 00:00:00 1995 Roland McGrath * sysdeps/i386/dl-machine.h (elf_machine_rel): Use +=, not =, to diff --git a/configure.in b/configure.in index 05191be..e7d4ecb 100644 --- a/configure.in +++ b/configure.in @@ -82,22 +82,18 @@ changequote(,)dnl # Expand the configuration machine name into a subdirectory by architecture # type and particular chip. case "$machine" in -i[345]86) - machine=i386/$machine ;; -sparc[6789]) - machine=sparc/$machine ;; -m68k) - machine=m68k/m68020 ;; -m680?0) - machine=m68k/$machine ;; -m88k) - machine=m88k/m88100 ;; -m88???) - machine=m88k/$machine ;; -mips64*) - machine=mips/mips64/$machine ;; -mips*) - machine=mips/$machine ;; +a29k | am29000) machine=a29k ;; +alpha*) machine=alpha/$machine ;; +hppa*) machine=hppa/$machine ;; +i[345]86) machine=i386/$machine ;; +m680?0) machine=m68k/$machine ;; +m68k) machine=m68k/m68020 ;; +m88???) machine=m88k/$machine ;; +m88k) machine=m88k/m88100 ;; +mips*) machine=mips/$machine ;; +mips64*) machine=mips/mips64/$machine ;; +sparc[6789]) machine=sparc/$machine ;; +supersparc) machine=sparc/sparc8 ;; esac # Make sco3.2v4 become sco3.2.4 and sunos4.1.1_U1 become sunos4.1.1.U1. diff --git a/hurd/Makefile b/hurd/Makefile index 582f37b..53b7348 100644 --- a/hurd/Makefile +++ b/hurd/Makefile @@ -26,7 +26,7 @@ include ../Makeconfig headers = hurd.h $(interface-headers) \ $(addprefix hurd/,fd.h id.h port.h signal.h userlink.h \ - resource.h threadvar.h) + resource.h threadvar.h lookup.h) distribute := hurdstartup.h hurdfault.h intr-rpc.defs STATUS @@ -44,7 +44,7 @@ routines = hurdstartup hurdinit \ setauth \ pid2task task2pid \ getuids setuids getumask fchroot \ - hurdsock hurdauth invoke-trans \ + hurdsock hurdauth \ privports \ msgportdemux \ fopenport \ diff --git a/hurd/hurd.h b/hurd/hurd.h index acad15b..17b5c45 100644 --- a/hurd/hurd.h +++ b/hurd/hurd.h @@ -77,11 +77,16 @@ extern struct hurd_port *_hurd_ports; extern unsigned int _hurd_nports; extern volatile mode_t _hurd_umask; -/* Shorthand macro for referencing _hurd_ports (see ). */ +/* Shorthand macro for internal library code referencing _hurd_ports (see + ). */ #define __USEPORT(which, expr) \ HURD_PORT_USE (&_hurd_ports[INIT_PORT_##which], (expr)) +/* Function version of __USEPORT: calls OPERATE with a send right. */ + +extern error_t _hurd_ports_use (int which, error_t (*operate) (mach_port_t)); + /* Base address and size of the initial stack set up by the exec server. If using cthreads, this stack is deallocated in startup. @@ -150,52 +155,6 @@ extern int setcttyid (mach_port_t); extern int __setauth (auth_t), setauth (auth_t); -/* Split FILE into a directory and a name within the directory. Look up a - port for the directory and store it in *DIR; store in *NAME a pointer - into FILE where the name within directory begins. The directory lookup - uses CRDIR for the root directory and CWDIR for the current directory. - Returns zero on success or an error code. */ - -extern error_t __hurd_file_name_split (file_t crdir, file_t cwdir, - const char *file, - file_t *dir, char **name); -extern error_t hurd_file_name_split (file_t crdir, file_t cwdir, - const char *file, - file_t *dir, char **name); - -/* Open a port to FILE with the given FLAGS and MODE (see ). - The file lookup uses CRDIR for the root directory and CWDIR for the - current directory. If successful, returns zero and store the port - to FILE in *PORT; otherwise returns an error code. */ - -extern error_t __hurd_file_name_lookup (file_t crdir, file_t cwdir, - const char *file, - int flags, mode_t mode, - file_t *port); -extern error_t hurd_file_name_lookup (file_t crdir, file_t cwdir, - const char *filename, - int flags, mode_t mode, - file_t *port); - -/* Process the values returned by `dir_lookup' et al, and loop doing - `dir_lookup' calls until one returns FS_RETRY_NONE. CRDIR is the - root directory used for things like symlinks to absolute file names; the - other arguments should be those just passed to and/or returned from - `dir_lookup', `fsys_getroot', or `file_invoke_translator'. This - function consumes the reference in *RESULT even if it returns an error. */ - -extern error_t __hurd_file_name_lookup_retry (file_t crdir, - enum retry_type doretry, - char retryname[1024], - int flags, mode_t mode, - file_t *result); -extern error_t hurd_file_name_lookup_retry (file_t crdir, - enum retry_type doretry, - char retryname[1024], - int flags, mode_t mode, - file_t *result); - - /* Split FILE into a directory and a name within the directory. The directory lookup uses the current root and working directory. If successful, stores in *NAME a pointer into FILE where the name @@ -213,15 +172,15 @@ extern file_t file_name_split (const char *file, char **name); extern file_t __file_name_lookup (const char *file, int flags, mode_t mode); extern file_t file_name_lookup (const char *file, int flags, mode_t mode); -/* Invoke any translator set on the node FILE represents, and return in - *TRANSLATED a port to the translated node. FLAGS are as for - `dir_lookup' et al, but the returned port will not necessarily have - any more access rights than FILE does. */ +/* Open a port to FILE with the given FLAGS and MODE (see ). The + file lookup uses the current root directory, but uses STARTDIR as the + "working directory" for file relative names. Returns a port to the file + if successful; otherwise sets `errno' and returns MACH_PORT_NULL. */ -extern error_t __hurd_invoke_translator (file_t file, int flags, - file_t *translated); -extern error_t hurd_invoke_translator (file_t file, int flags, - file_t *translated); +extern file_t __file_name_lookup_under (file_t startdir, const char *file, + int flags, mode_t mode); +extern file_t file_name_lookup_under (file_t startdir, const char *file, + int flags, mode_t mode); /* Open a file descriptor on a port. FLAGS are as for `open'; flags diff --git a/hurd/hurdinit.c b/hurd/hurdinit.c index af89211..409d2d1 100644 --- a/hurd/hurdinit.c +++ b/hurd/hurdinit.c @@ -31,6 +31,12 @@ struct hurd_port *_hurd_ports; unsigned int _hurd_nports; mode_t _hurd_umask; +error_t +_hurd_ports_use (int which, error_t (*operate) (mach_port_t)) +{ + return HURD_PORT_USE (&_hurd_ports[which], (*operate) (port)); +} + void _hurd_proc_init (char **argv); DEFINE_HOOK (_hurd_subinit, (void)); diff --git a/stdio/_itoa.c b/stdio/_itoa.c index 19e732d..caa8179 100644 --- a/stdio/_itoa.c +++ b/stdio/_itoa.c @@ -1,6 +1,8 @@ /* Internal function for converting integers to ASCII. Copyright (C) 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU C Library. +Contributed by Torbjorn Granlund +and Ulrich Drepper . The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as @@ -17,13 +19,400 @@ License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include +#include "../stdlib/gmp.h" +#include "../stdlib/gmp-impl.h" +#include "../stdlib/longlong.h" + +#include "_itoa.h" + + +/* Canonize environment. For some architectures not all values might + be defined in the GMP header files. */ +#ifndef UMUL_TIME +# define UMUL_TIME 1 +#endif +#ifndef UDIV_TIME +# define UDIV_TIME 1 +#endif + +/* Control memory layout. */ +#ifdef PACK +# undef PACK +# define PACK __attribute__ ((packed)) +#else +# define PACK +#endif + + +/* Declare local types. */ +struct base_table_t +{ +#if (UDIV_TIME > 2 * UMUL_TIME) + mp_limb base_multiplier; +#endif + char flag; + char post_shift; +#if BITS_PER_MP_LIMB == 32 + struct + { + char normalization_steps; + char ndigits; + mp_limb base PACK; +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb base_ninv PACK; +#endif + } big; +#endif +}; + +/* To reduce the memory needed we include some fields of the tables + only confitionally. */ +#if BITS_PER_MP_LIMB == 32 +# if UDIV_TIME > 2 * UMUL_TIME +# define SEL1(X) X, +# define SEL2(X) ,X +# else +# define SEL1(X) +# define SEL2(X) +# endif +#endif + + +/* Local variables. */ +static const struct base_table_t base_table[] = +{ +#if BITS_PER_MP_LIMB == 64 + /* 2 */ {0ul, 1, 1}, + /* 3 */ {0xaaaaaaaaaaaaaaabul, 0, 1}, + /* 4 */ {0ul, 1, 2}, + /* 5 */ {0xcccccccccccccccdul, 0, 2}, + /* 6 */ {0xaaaaaaaaaaaaaaabul, 0, 2}, + /* 7 */ {0x2492492492492493ul, 1, 3}, + /* 8 */ {0ul, 1, 3}, + /* 9 */ {0xe38e38e38e38e38ful, 0, 3}, + /* 10 */ {0xcccccccccccccccdul, 0, 3}, + /* 11 */ {0x2e8ba2e8ba2e8ba3ul, 0, 1}, + /* 12 */ {0xaaaaaaaaaaaaaaabul, 0, 3}, + /* 13 */ {0x4ec4ec4ec4ec4ec5ul, 0, 2}, + /* 14 */ {0x2492492492492493ul, 1, 4}, + /* 15 */ {0x8888888888888889ul, 0, 3}, + /* 16 */ {0ul, 1, 4}, + /* 17 */ {0xf0f0f0f0f0f0f0f1ul, 0, 4}, + /* 18 */ {0xe38e38e38e38e38ful, 0, 4}, + /* 19 */ {0xd79435e50d79435ful, 0, 4}, + /* 20 */ {0xcccccccccccccccdul, 0, 4}, + /* 21 */ {0x8618618618618619ul, 1, 5}, + /* 22 */ {0x2e8ba2e8ba2e8ba3ul, 0, 2}, + /* 23 */ {0x642c8590b21642c9ul, 1, 5}, + /* 24 */ {0xaaaaaaaaaaaaaaabul, 0, 4}, + /* 25 */ {0x47ae147ae147ae15ul, 1, 5}, + /* 26 */ {0x4ec4ec4ec4ec4ec5ul, 0, 3}, + /* 27 */ {0x97b425ed097b425ful, 0, 4}, + /* 28 */ {0x2492492492492493ul, 1, 5}, + /* 29 */ {0x1a7b9611a7b9611bul, 1, 5}, + /* 30 */ {0x8888888888888889ul, 0, 4}, + /* 31 */ {0x0842108421084211ul, 1, 5}, + /* 32 */ {0ul, 1, 5}, + /* 33 */ {0x0f83e0f83e0f83e1ul, 0, 1}, + /* 34 */ {0xf0f0f0f0f0f0f0f1ul, 0, 5}, + /* 35 */ {0xea0ea0ea0ea0ea0ful, 0, 5}, + /* 36 */ {0xe38e38e38e38e38ful, 0, 5} +#endif +#if BITS_PER_MP_LIMB == 32 + /* 2 */ {SEL1(0ul) 1, 1, {0, 31, 0x80000000ul SEL2(0xfffffffful)}}, + /* 3 */ {SEL1(0xaaaaaaabul) 0, 1, {0, 20, 0xcfd41b91ul SEL2(0x3b563c24ul)}}, + /* 4 */ {SEL1(0ul) 1, 2, {1, 15, 0x40000000ul SEL2(0xfffffffful)}}, + /* 5 */ {SEL1(0xcccccccdul) 0, 2, {1, 13, 0x48c27395ul SEL2(0xc25c2684ul)}}, + /* 6 */ {SEL1(0xaaaaaaabul) 0, 2, {0, 12, 0x81bf1000ul SEL2(0xf91bd1b6ul)}}, + /* 7 */ {SEL1(0x24924925ul) 1, 3, {1, 11, 0x75db9c97ul SEL2(0x1607a2cbul)}}, + /* 8 */ {SEL1(0ul) 1, 3, {1, 10, 0x40000000ul SEL2(0xfffffffful)}}, + /* 9 */ {SEL1(0x38e38e39ul) 0, 1, {0, 10, 0xcfd41b91ul SEL2(0x3b563c24ul)}}, + /* 10 */ {SEL1(0xcccccccdul) 0, 3, {2, 9, 0x3b9aca00ul SEL2(0x12e0be82ul)}}, + /* 11 */ {SEL1(0xba2e8ba3ul) 0, 3, {0, 9, 0x8c8b6d2bul SEL2(0xd24cde04ul)}}, + /* 12 */ {SEL1(0xaaaaaaabul) 0, 3, {3, 8, 0x19a10000ul SEL2(0x3fa39ab5ul)}}, + /* 13 */ {SEL1(0x4ec4ec4ful) 0, 2, {2, 8, 0x309f1021ul SEL2(0x50f8ac5ful)}}, + /* 14 */ {SEL1(0x24924925ul) 1, 4, {1, 8, 0x57f6c100ul SEL2(0x74843b1eul)}}, + /* 15 */ {SEL1(0x88888889ul) 0, 3, {0, 8, 0x98c29b81ul SEL2(0xad0326c2ul)}}, + /* 16 */ {SEL1(0ul) 1, 4, {3, 7, 0x10000000ul SEL2(0xfffffffful)}}, + /* 17 */ {SEL1(0xf0f0f0f1ul) 0, 4, {3, 7, 0x18754571ul SEL2(0x4ef0b6bdul)}}, + /* 18 */ {SEL1(0x38e38e39ul) 0, 2, {2, 7, 0x247dbc80ul SEL2(0xc0fc48a1ul)}}, + /* 19 */ {SEL1(0xaf286bcbul) 1, 5, {2, 7, 0x3547667bul SEL2(0x33838942ul)}}, + /* 20 */ {SEL1(0xcccccccdul) 0, 4, {1, 7, 0x4c4b4000ul SEL2(0xad7f29abul)}}, + /* 21 */ {SEL1(0x86186187ul) 1, 5, {1, 7, 0x6b5a6e1dul SEL2(0x313c3d15ul)}}, + /* 22 */ {SEL1(0xba2e8ba3ul) 0, 4, {0, 7, 0x94ace180ul SEL2(0xb8cca9e0ul)}}, + /* 23 */ {SEL1(0xb21642c9ul) 0, 4, {0, 7, 0xcaf18367ul SEL2(0x42ed6de9ul)}}, + /* 24 */ {SEL1(0xaaaaaaabul) 0, 4, {4, 6, 0x0b640000ul SEL2(0x67980e0bul)}}, + /* 25 */ {SEL1(0x51eb851ful) 0, 3, {4, 6, 0x0e8d4a51ul SEL2(0x19799812ul)}}, + /* 26 */ {SEL1(0x4ec4ec4ful) 0, 3, {3, 6, 0x1269ae40ul SEL2(0xbce85396ul)}}, + /* 27 */ {SEL1(0x2f684bdbul) 1, 5, {3, 6, 0x17179149ul SEL2(0x62c103a9ul)}}, + /* 28 */ {SEL1(0x24924925ul) 1, 5, {3, 6, 0x1cb91000ul SEL2(0x1d353d43ul)}}, + /* 29 */ {SEL1(0x8d3dcb09ul) 0, 4, {2, 6, 0x23744899ul SEL2(0xce1deceaul)}}, + /* 30 */ {SEL1(0x88888889ul) 0, 4, {2, 6, 0x2b73a840ul SEL2(0x790fc511ul)}}, + /* 31 */ {SEL1(0x08421085ul) 1, 5, {2, 6, 0x34e63b41ul SEL2(0x35b865a0ul)}}, + /* 32 */ {SEL1(0ul) 1, 5, {1, 6, 0x40000000ul SEL2(0xfffffffful)}}, + /* 33 */ {SEL1(0x3e0f83e1ul) 0, 3, {1, 6, 0x4cfa3cc1ul SEL2(0xa9aed1b3ul)}}, + /* 34 */ {SEL1(0xf0f0f0f1ul) 0, 5, {1, 6, 0x5c13d840ul SEL2(0x63dfc229ul)}}, + /* 35 */ {SEL1(0xd41d41d5ul) 1, 6, {1, 6, 0x6d91b519ul SEL2(0x2b0fee30ul)}}, + /* 36 */ {SEL1(0x38e38e39ul) 0, 3, {0, 6, 0x81bf1000ul SEL2(0xf91bd1b6ul)}} +#endif +}; + /* Lower-case digits. */ -const char _itoa_lower_digits[] = "0123456789abcdefghijklmnopqrstuvwxyz"; +static const char _itoa_lower_digits[] + = "0123456789abcdefghijklmnopqrstuvwxyz"; /* Upper-case digits. */ -const char _itoa_upper_digits[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; +static const char _itoa_upper_digits[] + = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"; -/* Cause _itoa.h to define _itoa as a real function instead of an - `extern inline'. */ -#define _EXTERN_INLINE /* empty */ -#include "_itoa.h" +char * +_itoa (value, buflim, base, upper_case) + unsigned long long int value; + char *buflim; + unsigned int base; + int upper_case; +{ + const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits; + char *bp = buflim; + const struct base_table_t *brec = &base_table[base - 2]; + + switch (base) + { +#define RUN_2N(BITS) \ + do \ + { \ + /* `unsigned long long int' always has 64 bits. */ \ + mp_limb work_hi = value >> (64 - BITS_PER_MP_LIMB); \ + \ + if (BITS_PER_MP_LIMB == 32) \ + if (work_hi != 0) \ + { \ + mp_limb work_lo; \ + int cnt; \ + \ + work_lo = value & 0xfffffffful; \ + for (cnt = BITS_PER_MP_LIMB / BITS; cnt > 0; --cnt) \ + { \ + *--bp = digits[work_lo & ((1ul << BITS) - 1)]; \ + work_lo >>= BITS; \ + } \ + if (BITS_PER_MP_LIMB % BITS != 0) \ + { \ + work_lo |= ((work_hi \ + & ((1 << BITS - BITS_PER_MP_LIMB % BITS) \ + - 1)) \ + << BITS_PER_MP_LIMB % BITS); \ + *--bp = digits[work_lo]; \ + work_hi >>= BITS - BITS_PER_MP_LIMB % BITS; \ + } \ + } \ + else \ + work_hi = value & 0xfffffffful; \ + do \ + { \ + *--bp = digits[work_hi & ((1 << BITS) - 1)]; \ + work_hi >>= BITS; \ + } \ + while (work_hi != 0); \ + } \ + while (0) + case 8: + RUN_2N (3); + break; + + case 16: + RUN_2N (4); + break; + + default: + { +#if BITS_PER_MP_LIMB == 64 + mp_limb base_multiplier = brec->base_multiplier; + if (brec->flag) + while (value != 0) + { + mp_limb quo, rem, x, dummy; + + umul_ppmm (x, dummy, value, base_multiplier); + quo = (x + ((value - x) >> 1)) >> (brec->post_shift - 1); + rem = value - quo * base; + *--bp = digits[rem]; + value = quo; + } + else + while (value != 0) + { + mp_limb quo, rem, x, dummy; + + umul_ppmm (x, dummy, value, base_multiplier); + quo = x >> brec->post_shift; + rem = value - quo * base; + *--bp = digits[rem]; + value = quo; + } +#endif +#if BITS_PER_MP_LIMB == 32 + mp_limb t[3]; + int n; + + /* First convert x0 to 1-3 words in base s->big.base. + Optimize for frequent cases of 32 bit numbers. */ + if ((mp_limb) (value >> 32) >= 1) + { + int big_normalization_steps = brec->big.normalization_steps; + mp_limb big_base_norm = brec->big.base << big_normalization_steps; + + if ((mp_limb) (value >> 32) >= brec->big.base) + { + mp_limb x1hi, x1lo, r; + /* If you want to optimize this, take advantage of + that the quotient in the first udiv_qrnnd will + always be very small. It might be faster just to + subtract in a tight loop. */ + +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb x, xh, xl; + + if (big_normalization_steps == 0) + xh = 0; + else + xh = (mp_limb) (value >> 64 - big_normalization_steps); + xl = (mp_limb) (value >> 32 - big_normalization_steps); + udiv_qrnnd_preinv (x1hi, r, xh, xl, big_base_norm, + brec->big.base_ninv); + + xl = ((mp_limb) value) << big_normalization_steps; + udiv_qrnnd_preinv (x1lo, x, r, xl, big_base_norm, + big_normalization_steps); + t[2] = x >> big_normalization_steps; + + if (big_normalization_steps == 0) + xh = x1hi; + else + xh = ((x1hi << big_normalization_steps) + | (x1lo >> 32 - big_normalization_steps)); + xl = x1lo << big_normalization_steps; + udiv_qrnnd_preinv (t[0], x, xh, xl, big_base_norm, + big_normalization_steps); + t[1] = x >> big_normalization_steps; +#elif UDIV_NEEDS_NORMALIZATION + mp_limb x, xh, xl; + + if (big_normalization_steps == 0) + xh = 0; + else + xh = (mp_limb) (value >> 64 - big_normalization_steps); + xl = (mp_limb) (value >> 32 - big_normalization_steps); + udiv_qrnnd (x1hi, r, xh, xl, big_base_norm); + + xl = ((mp_limb) value) << big_normalization_steps; + udiv_qrnnd (x1lo, x, r, xl, big_base_norm); + t[2] = x >> big_normalization_steps; + + if (big_normalization_steps == 0) + xh = x1hi; + else + xh = ((x1hi << big_normalization_steps) + | (x1lo >> 32 - big_normalization_steps)); + xl = x1lo << big_normalization_steps; + udiv_qrnnd (t[0], x, xh, xl, big_base_norm); + t[1] = x >> big_normalization_steps; +#else + udiv_qrnnd (x1hi, r, 0, (mp_limb) (value >> 32), + brec->big.base); + udiv_qrnnd (x1lo, t[2], r, (mp_limb) value, brec->big.base); + udiv_qrnnd (t[0], t[1], x1hi, x1lo, brec->big.base); +#endif + n = 3; + } + else + { +#if (UDIV_TIME > 2 * UMUL_TIME) + mp_limb x; + + value <<= brec->big.normalization_steps; + udiv_qrnnd_preinv (t[0], x, (mp_limb) (value >> 32), + (mp_limb) value, big_base_norm, + brec->big.base_ninv); + t[1] = x >> brec->big.normalization_steps; +#elif UDIV_NEEDS_NORMALIZATION + mp_limb x; + + value <<= big_normalization_steps; + udiv_qrnnd (t[0], x, (mp_limb) (value >> 32), + (mp_limb) value, big_base_norm); + t[1] = x >> big_normalization_steps; +#else + udiv_qrnnd (t[0], t[1], (mp_limb) (value >> 32), + (mp_limb) value, brec->big.base); +#endif + n = 2; + } + } + else + { + t[0] = value; + n = 1; + } + + /* Convert the 1-3 words in t[], word by word, to ASCII. */ + do + { + mp_limb ti = t[--n]; + int ndig_for_this_limb = 0; + +#if UDIV_TIME > 2 * UMUL_TIME + mp_limb base_multiplier = brec->base_multiplier; + if (brec->flag) + while (ti != 0) + { + mp_limb quo, rem, x, dummy; + + umul_ppmm (x, dummy, ti, base_multiplier); + quo = (x + ((ti - x) >> 1)) >> (brec->post_shift - 1); + rem = ti - quo * base; + *--bp = digits[rem]; + ti = quo; + ++ndig_for_this_limb; + } + else + while (ti != 0) + { + mp_limb quo, rem, x, dummy; + + umul_ppmm (x, dummy, ti, base_multiplier); + quo = x >> brec->post_shift; + rem = ti - quo * base; + *--bp = digits[rem]; + ti = quo; + ++ndig_for_this_limb; + } +#else + while (ti != 0) + { + mp_limb quo, rem; + + quo = ti / base; + rem = ti % base; + *--bp = digits[rem]; + ti = quo; + ++ndig_for_this_limb; + } +#endif + /* If this wasn't the most significant word, pad with zeros. */ + if (n != 0) + while (ndig_for_this_limb < brec->big.ndigits) + { + *--bp = '0'; + ++ndig_for_this_limb; + } + } + while (n != 0); +#endif + } + break; + } + + return bp; +} diff --git a/stdio/_itoa.h b/stdio/_itoa.h index 8124050..ab3d1d1 100644 --- a/stdio/_itoa.h +++ b/stdio/_itoa.h @@ -21,8 +21,6 @@ Cambridge, MA 02139, USA. */ #define _ITOA_H #include -extern const char _itoa_lower_digits[], _itoa_upper_digits[]; - /* Convert VALUE into ASCII in base BASE (2..36). Write backwards starting the character just before BUFLIM. Return the address of the first (left-to-right) character in the number. @@ -31,28 +29,4 @@ extern const char _itoa_lower_digits[], _itoa_upper_digits[]; extern char *_itoa __P ((unsigned long long int value, char *buflim, unsigned int base, int upper_case)); -#ifndef _EXTERN_INLINE -#define _EXTERN_INLINE extern __inline -#endif - -_EXTERN_INLINE -char * -_itoa (unsigned long long int value, char *buflim, - unsigned int base, int upper_case) -{ - /* Base-36 digits for numbers. */ - const char *digits = upper_case ? _itoa_upper_digits : _itoa_lower_digits; - - register char *bp = buflim; - - while (value > 0) - { - *--bp = digits[value % base]; - value /= base; - } - - return bp; -} - - #endif /* itoa.h */ diff --git a/stdio/test_rdwr.c b/stdio/test_rdwr.c index 8e0c1df..f987f16 100644 --- a/stdio/test_rdwr.c +++ b/stdio/test_rdwr.c @@ -17,6 +17,7 @@ not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #include +#include #include #include #include diff --git a/stdlib/gmp-impl.h b/stdlib/gmp-impl.h index ccffe7b..48d3af9 100644 --- a/stdlib/gmp-impl.h +++ b/stdlib/gmp-impl.h @@ -19,11 +19,17 @@ along with the GNU MP Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #if ! defined (alloca) -#if defined (__GNUC__) || defined (__sparc__) || defined (sparc) +#if defined (__GNUC__) #define alloca __builtin_alloca #endif #endif +#if ! defined (alloca) +#if defined (__sparc__) || defined (sparc) || defined (__sgi) +#include +#endif +#endif + #ifndef NULL #define NULL 0L #endif @@ -168,6 +174,7 @@ void _mp_default_free (); else \ ____mpn_sqr_n (prodp, up, size, tspace); \ } while (0); +#define assert(trueval) do {if (!(trueval)) abort ();} while (0) /* Structure for conversion between internal binary format and strings in base 2..36. */ @@ -197,9 +204,11 @@ struct bases extern const struct bases __mp_bases[]; extern mp_size_t __gmp_default_fp_limb_precision; -/* Divide the two-limb number in (NH,,NL) by D, with DI being a 32 bit - approximation to (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB). - Put the quotient in Q and the remainder in R. */ +/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest + limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB). + If this would yield overflow, DI should be the largest possible number + (i.e., only ones). For correct operation, the most significant bit of D + has to be set. Put the quotient in Q and the remainder in R. */ #define udiv_qrnnd_preinv(q, r, nh, nl, d, di) \ do { \ mp_limb _q, _ql, _r; \ @@ -226,6 +235,8 @@ extern mp_size_t __gmp_default_fp_limb_precision; (r) = _r; \ (q) = _q; \ } while (0) +/* Like udiv_qrnnd_preinv, but for for any value D. DNORM is D shifted left + so that its most significant bit is set. LGUP is ceil(log2(D)). */ #define udiv_qrnnd_preinv2gen(q, r, nh, nl, d, di, dnorm, lgup) \ do { \ mp_limb n2, n10, n1, nadj, q1; \ @@ -243,6 +254,8 @@ extern mp_size_t __gmp_default_fp_limb_precision; (r) = _xl + ((d) & _xh); \ (q) = _xh - q1; \ } while (0) +/* Exactly like udiv_qrnnd_preinv, but branch-free. It is not clear which + version to use. */ #define udiv_qrnnd_preinv2norm(q, r, nh, nl, d, di) \ do { \ mp_limb n2, n10, n1, nadj, q1; \ @@ -262,22 +275,49 @@ extern mp_size_t __gmp_default_fp_limb_precision; } while (0) #if defined (__GNUC__) -/* Define stuff for longlong.h asm macros. */ -#if __GNUC_NEW_ATTR_MODE_SYNTAX -typedef unsigned int UQItype __attribute__ ((mode ("QI"))); -typedef int SItype __attribute__ ((mode ("SI"))); -typedef unsigned int USItype __attribute__ ((mode ("SI"))); -typedef int DItype __attribute__ ((mode ("DI"))); -typedef unsigned int UDItype __attribute__ ((mode ("DI"))); -#else +/* Define stuff for longlong.h. */ typedef unsigned int UQItype __attribute__ ((mode (QI))); typedef int SItype __attribute__ ((mode (SI))); typedef unsigned int USItype __attribute__ ((mode (SI))); typedef int DItype __attribute__ ((mode (DI))); typedef unsigned int UDItype __attribute__ ((mode (DI))); -#endif +#else +typedef unsigned char UQItype; +typedef long SItype; +typedef unsigned long USItype; #endif typedef mp_limb UWtype; typedef unsigned int UHWtype; #define W_TYPE_SIZE BITS_PER_MP_LIMB + + +#ifndef IEEE_DOUBLE_BIG_ENDIAN +#define IEEE_DOUBLE_BIG_ENDIAN 1 +#endif + +#if IEEE_DOUBLE_BIG_ENDIAN +union ieee_double_extract +{ + struct + { + unsigned long sig:1; + unsigned long exp:11; + unsigned long manh:20; + unsigned long manl:32; + } s; + double d; +}; +#else +union ieee_double_extract +{ + struct + { + unsigned long manl:32; + unsigned long manh:20; + unsigned long exp:11; + unsigned long sig:1; + } s; + double d; +}; +#endif diff --git a/stdlib/gmp.h b/stdlib/gmp.h index 95c2f1b..0b2cb29 100644 --- a/stdlib/gmp.h +++ b/stdlib/gmp.h @@ -24,13 +24,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define __need_size_t #include -#ifdef __STDC__ +#if defined (__STDC__) #define __gmp_const const #else #define __gmp_const #endif -#ifdef __GNUC__ +#if defined (__GNUC__) #define __gmp_inline inline #else #define __gmp_inline @@ -40,9 +40,14 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ typedef unsigned int mp_limb; typedef int mp_limb_signed; #else +#if _LONG_LONG_LIMB +typedef unsigned long long int mp_limb; +typedef long long int mp_limb_signed; +#else typedef unsigned long int mp_limb; typedef long int mp_limb_signed; #endif +#endif typedef mp_limb * mp_ptr; typedef __gmp_const mp_limb * mp_srcptr; @@ -52,9 +57,9 @@ typedef long int mp_exp_t; #ifndef __MP_SMALL__ typedef struct { - long int alloc; /* Number of *limbs* allocated and pointed + mp_size_t alloc; /* Number of *limbs* allocated and pointed to by the D field. */ - long int size; /* abs(SIZE) is the number of limbs + mp_size_t size; /* abs(SIZE) is the number of limbs the last field points to. If SIZE is negative this is a negative number. */ @@ -130,12 +135,16 @@ typedef __mpf_struct *mpf_ptr; typedef __gmp_const __mpq_struct *mpq_srcptr; typedef __mpq_struct *mpq_ptr; -#ifdef __STDC__ +#if defined (__STDC__) #define _PROTO(x) x #else #define _PROTO(x) () #endif +#if defined (FILE) || defined (_STDIO_H_) || defined (__STDIO_H__) || defined (H_STDIO) +#define _GMP_H_HAVE_FILE 1 +#endif + void mp_set_memory_functions _PROTO((void *(*) (size_t), void *(*) (void *, size_t, size_t), void (*) (void *, size_t))); @@ -165,7 +174,7 @@ unsigned long int mpz_get_ui _PROTO ((mpz_srcptr)); mp_limb mpz_getlimbn _PROTO ((mpz_srcptr, mp_size_t)); mp_size_t mpz_hamdist _PROTO ((mpz_srcptr, mpz_srcptr)); void mpz_init _PROTO ((mpz_ptr)); -#ifdef FILE +#ifdef _GMP_H_HAVE_FILE void mpz_inp_raw _PROTO ((mpz_ptr, FILE *)); int mpz_inp_str _PROTO ((mpz_ptr, FILE *, int)); #endif @@ -180,7 +189,7 @@ void mpz_mul _PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr)); void mpz_mul_2exp _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int)); void mpz_mul_ui _PROTO ((mpz_ptr, mpz_srcptr, unsigned long int)); void mpz_neg _PROTO ((mpz_ptr, mpz_srcptr)); -#ifdef FILE +#ifdef _GMP_H_HAVE_FILE void mpz_out_raw _PROTO ((FILE *, mpz_srcptr)); void mpz_out_str _PROTO ((FILE *, int, mpz_srcptr)); #endif @@ -218,6 +227,8 @@ void mpz_tdiv_qr_ui _PROTO((mpz_ptr, mpz_ptr, mpz_srcptr, unsigned long int)); void mpz_tdiv_r _PROTO((mpz_ptr, mpz_srcptr, mpz_srcptr)); void mpz_tdiv_r_ui _PROTO((mpz_ptr, mpz_srcptr, unsigned long int)); +void mpz_array_init (mpz_ptr, size_t, mp_size_t); + /**************** Rational (i.e. Q) routines. ****************/ void mpq_init _PROTO ((mpq_ptr)); @@ -253,7 +264,7 @@ void mpf_dump _PROTO ((mpf_srcptr)); char *mpf_get_str _PROTO ((char *, mp_exp_t *, int, size_t, mpf_srcptr)); void mpf_init _PROTO ((mpf_ptr)); void mpf_init2 _PROTO ((mpf_ptr, mp_size_t)); -#ifdef FILE +#ifdef _GMP_H_HAVE_FILE void mpf_inp_str _PROTO ((mpf_ptr, FILE *, int)); #endif void mpf_init_set _PROTO ((mpf_ptr, mpf_srcptr)); @@ -265,7 +276,7 @@ void mpf_mul _PROTO ((mpf_ptr, mpf_srcptr, mpf_srcptr)); void mpf_mul_2exp _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int)); void mpf_mul_ui _PROTO ((mpf_ptr, mpf_srcptr, unsigned long int)); void mpf_neg _PROTO ((mpf_ptr, mpf_srcptr)); -#ifdef FILE +#ifdef _GMP_H_HAVE_FILE void mpf_out_str _PROTO ((mpf_ptr, int, size_t, FILE *)); #endif void mpf_set _PROTO ((mpf_ptr, mpf_srcptr)); @@ -335,7 +346,7 @@ mp_limb __mpn_gcd_1 _PROTO ((mp_srcptr, mp_size_t, mp_limb)); static __gmp_inline mp_limb -#if __STDC__ +#if defined (__STDC__) __mpn_add_1 (register mp_ptr res_ptr, register mp_srcptr s1_ptr, register mp_size_t s1_size, @@ -377,7 +388,7 @@ __mpn_add_1 (res_ptr, s1_ptr, s1_size, s2_limb) } static __gmp_inline mp_limb -#if __STDC__ +#if defined (__STDC__) __mpn_add (register mp_ptr res_ptr, register mp_srcptr s1_ptr, register mp_size_t s1_size, @@ -406,7 +417,7 @@ __mpn_add (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size) } static __gmp_inline mp_limb -#if __STDC__ +#if defined (__STDC__) __mpn_sub_1 (register mp_ptr res_ptr, register mp_srcptr s1_ptr, register mp_size_t s1_size, @@ -448,7 +459,7 @@ __mpn_sub_1 (res_ptr, s1_ptr, s1_size, s2_limb) } static __gmp_inline mp_limb -#if __STDC__ +#if defined (__STDC__) __mpn_sub (register mp_ptr res_ptr, register mp_srcptr s1_ptr, register mp_size_t s1_size, @@ -477,7 +488,7 @@ __mpn_sub (res_ptr, s1_ptr, s1_size, s2_ptr, s2_size) } static __gmp_inline mp_size_t -#if __STDC__ +#if defined (__STDC__) __mpn_normal_size (mp_srcptr ptr, mp_size_t size) #else __mpn_normal_size (ptr, size) @@ -512,7 +523,6 @@ __mpn_normal_size (ptr, size) /* Useful synonyms, but not quite compatible with GMP 1. */ #define mpz_div mpz_fdiv_q #define mpz_divmod mpz_fdiv_qr -#define mpz_mod mpz_fdiv_r #define mpz_div_ui mpz_fdiv_q_ui #define mpz_divmod_ui mpz_fdiv_qr_ui #define mpz_mod_ui mpz_fdiv_r_ui diff --git a/stdlib/longlong.h b/stdlib/longlong.h index 97c469d..bbb92e3 100644 --- a/stdlib/longlong.h +++ b/stdlib/longlong.h @@ -97,7 +97,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ #define __AND_CLOBBER_CC , "cc" #endif /* __GNUC__ < 2 */ -#if (defined (__a29k__) || defined (___AM29K__)) && W_TYPE_SIZE == 32 +#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add %1,%4,%5 addc %0,%2,%3" \ @@ -152,6 +152,7 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ (pl) = __m0 * __m1; \ } while (0) #define UMUL_TIME 46 +#ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { UDItype __r; \ (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ @@ -159,12 +160,13 @@ the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ } while (0) extern UDItype __udiv_qrnnd (); #define UDIV_TIME 220 -#endif +#endif /* LONGLONG_STANDALONE */ +#endif /* __alpha__ */ #if defined (__arm__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ - __asm__ ("adds %1,%4,%5 - adc %0,%2,%3" \ + __asm__ ("adds %1, %4, %5 + adc %0, %2, %3" \ : "=r" ((USItype)(sh)), \ "=&r" ((USItype)(sl)) \ : "%r" ((USItype)(ah)), \ @@ -172,8 +174,8 @@ extern UDItype __udiv_qrnnd (); "%r" ((USItype)(al)), \ "rI" ((USItype)(bl))) #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ - __asm__ ("subs %1,%4,%5 - sbc %0,%2,%3" \ + __asm__ ("subs %1, %4, %5 + sbc %0, %2, %3" \ : "=r" ((USItype)(sh)), \ "=&r" ((USItype)(sl)) \ : "r" ((USItype)(ah)), \ @@ -181,19 +183,19 @@ extern UDItype __udiv_qrnnd (); "r" ((USItype)(al)), \ "rI" ((USItype)(bl))) #define umul_ppmm(xh, xl, a, b) \ - __asm__ ("; Inlined umul_ppmm - mov r0,%2 lsr 16 - mov r2,%3 lsr 16 - bic r1,%2,r0 lsl 16 - bic r2,%3,r2 lsl 16 - mul %1,r1,r2 - mul r2,r0,r2 - mul r1,%0,r1 - mul %0,r0,%0 - adds r1,r2,r1 - addcs %0,%0,0x10000 - adds %1,%1,r1 lsl 16 - adc %0,%0,r1 lsr 16" \ + __asm__ ("%@ Inlined umul_ppmm + mov %|r0, %2, lsr #16 + mov %|r2, %3, lsr #16 + bic %|r1, %2, %|r0, lsl #16 + bic %|r2, %3, %|r2, lsl #16 + mul %1, %|r1, %|r2 + mul %|r2, %|r0, %|r2 + mul %|r1, %0, %|r1 + mul %0, %|r0, %0 + adds %|r1, %|r2, %|r1 + addcs %0, %0, #65536 + adds %1, %1, %|r1, lsl #16 + adc %0, %0, %|r1, lsr #16" \ : "=&r" ((USItype)(xh)), \ "=r" ((USItype)(xl)) \ : "r" ((USItype)(a)), \ @@ -296,9 +298,9 @@ extern UDItype __udiv_qrnnd (); struct {USItype __h, __l;} __i; \ } __xx; \ __asm__ ("xmpyu %1,%2,%0" \ - : "=x" (__xx.__ll) \ - : "x" ((USItype)(u)), \ - "x" ((USItype)(v))); \ + : "=fx" (__xx.__ll) \ + : "fx" ((USItype)(u)), \ + "fx" ((USItype)(v))); \ (wh) = __xx.__i.__h; \ (wl) = __xx.__i.__l; \ } while (0) @@ -308,12 +310,14 @@ extern UDItype __udiv_qrnnd (); #define UMUL_TIME 40 #define UDIV_TIME 80 #endif +#ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { USItype __r; \ (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ (r) = __r; \ } while (0) extern USItype __udiv_qrnnd (); +#endif /* LONGLONG_STANDALONE */ #define count_leading_zeros(count, x) \ do { \ USItype __tmp; \ @@ -419,8 +423,12 @@ extern USItype __udiv_qrnnd (); } while (0) #define count_trailing_zeros(count, x) \ __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))) +#ifndef UMUL_TIME #define UMUL_TIME 40 +#endif +#ifndef UDIV_TIME #define UDIV_TIME 40 +#endif #endif /* 80x86 */ #if defined (__i960__) && W_TYPE_SIZE == 32 @@ -442,7 +450,7 @@ extern USItype __udiv_qrnnd (); __w; }) #endif /* __i960__ */ -#if defined (__mc68000__) && W_TYPE_SIZE == 32 +#if (defined (__mc68000__) || defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("add%.l %5,%1 addx%.l %3,%0" \ @@ -489,38 +497,34 @@ extern USItype __udiv_qrnnd (); : "=d" ((USItype)(count)) \ : "od" ((USItype)(x)), "n" (0)) #else /* not mc68020 */ -#define umul_ppmm(xh, xl, a, b) \ - __asm__ ("| Inlined umul_ppmm - move%.l %2,%/d0 - move%.l %3,%/d1 - move%.l %/d0,%/d2 - swap %/d0 - move%.l %/d1,%/d3 - swap %/d1 - move%.w %/d2,%/d4 - mulu %/d3,%/d4 - mulu %/d1,%/d2 - mulu %/d0,%/d3 - mulu %/d0,%/d1 - move%.l %/d4,%/d0 - eor%.w %/d0,%/d0 - swap %/d0 - add%.l %/d0,%/d2 - add%.l %/d3,%/d2 +#define umul_ppmmxx(xh, xl, a, b) \ + do { USItype __umul_tmp1, __umul_tmp2; \ + __asm__ ("| Inlined umul_ppmm + move%.l %5,%3 + move%.l %2,%0 + move%.w %3,%1 + swap %3 + swap %0 + mulu %2,%1 + mulu %3,%0 + mulu %2,%3 + swap %2 + mulu %5,%2 + add%.l %3,%2 jcc 1f - add%.l #65536,%/d1 -1: swap %/d2 - moveq #0,%/d0 - move%.w %/d2,%/d0 - move%.w %/d4,%/d2 - move%.l %/d2,%1 - add%.l %/d1,%/d0 - move%.l %/d0,%0" \ - : "=g" ((USItype)(xh)), \ - "=g" ((USItype)(xl)) \ - : "g" ((USItype)(a)), \ - "g" ((USItype)(b)) \ - : "d0", "d1", "d2", "d3", "d4") + add%.l %#0x10000,%0 +1: move%.l %2,%3 + clr%.w %2 + swap %2 + swap %3 + clr%.w %3 + add%.l %3,%1 + addx%.l %2,%0 + | End inlined umul_ppmm" \ + : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \ + "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ + : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ + } while (0) #define UMUL_TIME 100 #define UDIV_TIME 400 #endif /* not mc68020 */ @@ -553,7 +557,7 @@ extern USItype __udiv_qrnnd (); : "r" ((USItype)(x))); \ (count) = __cbtmp ^ 31; \ } while (0) -#if defined (__mc88110__) +#if defined (__m88110__) #define umul_ppmm(wh, wl, u, v) \ do { \ union {UDItype __ll; \ @@ -582,10 +586,18 @@ extern USItype __udiv_qrnnd (); #else #define UMUL_TIME 17 #define UDIV_TIME 150 -#endif /* __mc88110__ */ +#endif /* __m88110__ */ #endif /* __m88000__ */ #if defined (__mips__) && W_TYPE_SIZE == 32 +#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("multu %2,%3" \ + : "=l" ((USItype)(w0)), \ + "=h" ((USItype)(w1)) \ + : "d" ((USItype)(u)), \ + "d" ((USItype)(v))) +#else #define umul_ppmm(w1, w0, u, v) \ __asm__ ("multu %2,%3 mflo %0 @@ -594,11 +606,20 @@ extern USItype __udiv_qrnnd (); "=d" ((USItype)(w1)) \ : "d" ((USItype)(u)), \ "d" ((USItype)(v))) +#endif #define UMUL_TIME 10 #define UDIV_TIME 100 #endif /* __mips__ */ #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 +#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmultu %2,%3" \ + : "=l" ((UDItype)(w0)), \ + "=h" ((UDItype)(w1)) \ + : "d" ((UDItype)(u)), \ + "d" ((UDItype)(v))) +#else #define umul_ppmm(w1, w0, u, v) \ __asm__ ("dmultu %2,%3 mflo %0 @@ -607,8 +628,9 @@ extern USItype __udiv_qrnnd (); "=d" ((UDItype)(w1)) \ : "d" ((UDItype)(u)), \ "d" ((UDItype)(v))) -#define UMUL_TIME 10 -#define UDIV_TIME 100 +#endif +#define UMUL_TIME 20 +#define UDIV_TIME 140 #endif /* __mips__ */ #if defined (__ns32000__) && W_TYPE_SIZE == 32 @@ -647,7 +669,7 @@ extern USItype __udiv_qrnnd (); } while (0) #endif /* __ns32000__ */ -#if (defined (__powerpc__) || defined (___IBMR2__)) && W_TYPE_SIZE == 32 +#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (bh) && (bh) == 0) \ @@ -676,14 +698,14 @@ extern USItype __udiv_qrnnd (); #define sub_ddmmss(sh, sl, ah, al, bh, bl) \ do { \ if (__builtin_constant_p (ah) && (ah) == 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ : "=r" ((USItype)(sh)), \ "=&r" ((USItype)(sl)) \ : "r" ((USItype)(bh)), \ "rI" ((USItype)(al)), \ "r" ((USItype)(bl))); \ else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0) \ - __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ : "=r" ((USItype)(sh)), \ "=&r" ((USItype)(sl)) \ : "r" ((USItype)(bh)), \ @@ -716,7 +738,7 @@ extern USItype __udiv_qrnnd (); __asm__ ("{cntlz|cntlzw} %0,%1" \ : "=r" ((USItype)(count)) \ : "r" ((USItype)(x))) -#if defined (__powerpc__) +#if defined (_ARCH_PPC) #define umul_ppmm(ph, pl, m0, m1) \ do { \ USItype __m0 = (m0), __m1 = (m1); \ @@ -785,16 +807,15 @@ extern USItype __udiv_qrnnd (); "g" ((USItype)(bh)), \ "1" ((USItype)(al)), \ "g" ((USItype)(bl))) -/* This insn doesn't work on ancient pyramids. */ +/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ #define umul_ppmm(w1, w0, u, v) \ ({union {UDItype __ll; \ struct {USItype __h, __l;} __i; \ } __xx; \ - __xx.__i.__l = u; \ - __asm__ ("uemul %3,%0" \ - : "=r" (__xx.__i.__h), \ - "=r" (__xx.__i.__l) \ - : "1" (__xx.__i.__l), \ + __asm__ ("movw %1,%R0 + uemul %2,%0" \ + : "=&r" (__xx.__ll) \ + : "g" ((USItype) (u)), \ "g" ((USItype)(v))); \ (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) #endif /* __pyr__ */ @@ -868,6 +889,20 @@ extern USItype __udiv_qrnnd (); } while (0) #endif +#if defined (__sh2__) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ( \ + "dmulu.l %2,%3 + sts macl,%1 + sts mach,%0" \ + : "=r" ((USItype)(w1)), \ + "=r" ((USItype)(w0)) \ + : "r" ((USItype)(u)), \ + "r" ((USItype)(v)) \ + : "macl", "mach") +#define UMUL_TIME 5 +#endif + #if defined (__sparc__) && W_TYPE_SIZE == 32 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \ __asm__ ("addcc %r4,%5,%1 @@ -901,17 +936,21 @@ extern USItype __udiv_qrnnd (); : "r" ((USItype)(u)), \ "r" ((USItype)(v))) #define UMUL_TIME 5 -/* We might want to leave this undefined for `SuperSPARC (tm)' since - its implementation is crippled and often traps. */ +#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */ #define udiv_qrnnd(q, r, n1, n0, d) \ - __asm__ ("mov %2,%%y;nop;nop;nop;udiv %3,%4,%0;umul %0,%4,%1;sub %3,%1,%1"\ - : "=&r" ((USItype)(q)), \ - "=&r" ((USItype)(r)) \ - : "r" ((USItype)(n1)), \ - "r" ((USItype)(n0)), \ - "r" ((USItype)(d))) + do { \ + USItype __q; \ + __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ + : "=r" ((USItype)(__q)) \ + : "r" ((USItype)(n1)), \ + "r" ((USItype)(n0)), \ + "r" ((USItype)(d))); \ + (r) = (n0) - __q * (d); \ + (q) = __q; \ + } while (0) #define UDIV_TIME 25 -#else +#endif /* SUPERSPARC */ +#else /* ! __sparc_v8__ */ #if defined (__sparclite__) /* This has hardware multiply but not divide. It also has two additional instructions scan (ffs from high bit) and divscc. */ @@ -973,9 +1012,10 @@ extern USItype __udiv_qrnnd (); __asm__ ("scan %1,0,%0" \ : "=r" ((USItype)(x)) \ : "r" ((USItype)(count))) -#else -/* SPARC without integer multiplication and divide instructions. - (i.e. at least Sun4/20,40,60,65,75,110,260,280,330,360,380,470,490) */ +#endif /* __sparclite__ */ +#endif /* __sparc_v8__ */ +/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ +#ifndef umul_ppmm #define umul_ppmm(w1, w0, u, v) \ __asm__ ("! Inlined umul_ppmm wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr @@ -1023,6 +1063,9 @@ extern USItype __udiv_qrnnd (); "r" ((USItype)(v)) \ : "%g1", "%g2" __AND_CLOBBER_CC) #define UMUL_TIME 39 /* 39 instructions */ +#endif +#ifndef udiv_qrnnd +#ifndef LONGLONG_STANDALONE #define udiv_qrnnd(q, r, n1, n0, d) \ do { USItype __r; \ (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ @@ -1030,8 +1073,8 @@ extern USItype __udiv_qrnnd (); } while (0) extern USItype __udiv_qrnnd (); #define UDIV_TIME 140 -#endif /* __sparclite__ */ -#endif /* __sparc_v8__ */ +#endif /* LONGLONG_STANDALONE */ +#endif /* udiv_qrnnd */ #endif /* __sparc__ */ #if defined (__vax__) && W_TYPE_SIZE == 32 @@ -1075,7 +1118,7 @@ extern USItype __udiv_qrnnd (); __xx.__i.__h = n1; __xx.__i.__l = n0; \ __asm__ ("ediv %3,%2,%0,%1" \ : "=g" (q), "=g" (r) \ - : "g" (__n1n0.ll), "g" (d)); \ + : "g" (__xx.ll), "g" (d)); \ } while (0) #endif /* __vax__ */ @@ -1173,11 +1216,12 @@ extern USItype __udiv_qrnnd (); do { \ UWtype __x0, __x1, __x2, __x3; \ UHWtype __ul, __vl, __uh, __vh; \ + UWtype __u = (u), __v = (v); \ \ - __ul = __ll_lowpart (u); \ - __uh = __ll_highpart (u); \ - __vl = __ll_lowpart (v); \ - __vh = __ll_highpart (v); \ + __ul = __ll_lowpart (__u); \ + __uh = __ll_highpart (__u); \ + __vl = __ll_lowpart (__v); \ + __vh = __ll_highpart (__v); \ \ __x0 = (UWtype) __ul * __vl; \ __x1 = (UWtype) __ul * __vh; \ @@ -1194,6 +1238,17 @@ extern USItype __udiv_qrnnd (); } while (0) #endif +#if !defined (umul_ppmm) +#define smul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __w1; \ + UWtype __m0 = (u), __m1 = (v); \ + umul_ppmm (__w1, w0, __m0, __m1); \ + (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1) \ + - (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0); \ + } while (0) +#endif + /* Define this unconditionally, so it can be used for debugging. */ #define __udiv_qrnnd_c(q, r, n1, n0, d) \ do { \ diff --git a/sysdeps/alpha/add_n.s b/sysdeps/alpha/add_n.s new file mode 100644 index 0000000..e1ad460 --- /dev/null +++ b/sysdeps/alpha/add_n.s @@ -0,0 +1,119 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + addq $5,$6,$6 + cmpult $6,$5,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/sysdeps/alpha/addmul_1.s b/sysdeps/alpha/addmul_1.s new file mode 100644 index 0000000..46d277d --- /dev/null +++ b/sysdeps/alpha/addmul_1.s @@ -0,0 +1,100 @@ + # Alpha 21064 __mpn_addmul_1 -- Multiply a limb vector with a limb and add + # the result to a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the 21064. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 2 +__mpn_addmul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + addq $5,$3,$3 + cmpult $3,$5,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: addq $5,$3,$3 + cmpult $3,$5,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_addmul_1 diff --git a/sysdeps/alpha/alphaev5/add_n.s b/sysdeps/alpha/alphaev5/add_n.s new file mode 100644 index 0000000..2aaf041 --- /dev/null +++ b/sysdeps/alpha/alphaev5/add_n.s @@ -0,0 +1,118 @@ + # Alpha __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 4 +.Loop: subq $19,4,$19 + unop + + ldq $6,8($18) + addq $4,$0,$0 + ldq $5,8($17) + cmpult $0,$4,$1 + ldq $4,16($18) + addq $3,$0,$20 + cmpult $20,$3,$0 + ldq $3,16($17) + or $0,$1,$0 + addq $6,$0,$0 + cmpult $0,$6,$1 + ldq $6,24($18) + addq $5,$0,$21 + cmpult $21,$5,$0 + ldq $5,24($17) + or $0,$1,$0 + addq $4,$0,$0 + cmpult $0,$4,$1 + ldq $4,32($18) + addq $3,$0,$22 + cmpult $22,$3,$0 + ldq $3,32($17) + or $0,$1,$0 + addq $6,$0,$0 + cmpult $0,$6,$1 + addq $5,$0,$23 + cmpult $23,$5,$0 + or $0,$1,$0 + + stq $20,0($16) + stq $21,8($16) + stq $22,16($16) + stq $23,24($16) + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + addq $3,$4,$4 + cmpult $4,$3,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_add_n diff --git a/sysdeps/alpha/alphaev5/lshift.s b/sysdeps/alpha/alphaev5/lshift.s new file mode 100644 index 0000000..fdb0895 --- /dev/null +++ b/sysdeps/alpha/alphaev5/lshift.s @@ -0,0 +1,175 @@ + # Alpha EV5 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $31,$19,$20 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + srl $4,$20,$0 # compute function result + + beq $28,L0 + subq $18,$28,$18 + + .align 3 +Loop0: ldq $3,-16($17) + subq $16,8,$16 + sll $4,$19,$5 + subq $17,8,$17 + subq $28,1,$28 + srl $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,0($16) + bne $28,Loop0 + +L0: sll $4,$19,$24 + beq $18,Lend + # warm up phase 1 + ldq $1,-16($17) + subq $18,4,$18 + ldq $2,-24($17) + ldq $3,-32($17) + ldq $4,-40($17) + beq $18,Lcool1 + # warm up phase 2 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + ldq $1,-48($17) + sll $2,$19,$22 + ldq $2,-56($17) + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + ldq $3,-64($17) + sll $4,$19,$24 + ldq $4,-72($17) + subq $18,4,$18 + beq $18,Lcool1 + .align 4 + # main loop +Loop: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + + srl $1,$20,$7 + subq $18,4,$18 + sll $1,$19,$21 + unop # ldq $31,-96($17) + + srl $2,$20,$8 + ldq $1,-80($17) + sll $2,$19,$22 + ldq $2,-88($17) + + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + + srl $3,$20,$5 + unop # ldq $31,-96($17) + sll $3,$19,$23 + subq $16,32,$16 + + srl $4,$20,$6 + ldq $3,-96($17 + sll $4,$19,$24 + ldq $4,-104($17) + + subq $17,32,$17 + bne $18,Loop + unop + unop + # cool down phase 2/1 +Lcool1: stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + stq $5,-24($16) + or $7,$24,$7 + stq $6,-32($16) + or $8,$21,$8 + srl $3,$20,$5 + sll $3,$19,$23 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 2/2 + stq $7,-40($16) + or $5,$22,$5 + stq $8,-48($16) + or $6,$23,$6 + stq $5,-56($16) + stq $6,-64($16) + # cool down phase 2/3 + stq $24,-72($16) + ret $31,($26),1 + + # cool down phase 1/1 +Lcool1: srl $1,$20,$7 + sll $1,$19,$21 + srl $2,$20,$8 + sll $2,$19,$22 + srl $3,$20,$5 + or $7,$24,$7 + sll $3,$19,$23 + or $8,$21,$8 + srl $4,$20,$6 + sll $4,$19,$24 + # cool down phase 1/2 + stq $7,-8($16) + or $5,$22,$5 + stq $8,-16($16) + or $6,$23,$6 + stq $5,-24($16) + stq $6,-32($16) + stq $24,-40($16) + ret $31,($26),1 + +Lend stq $24,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/alpha/alphaev5/rshift.s b/sysdeps/alpha/alphaev5/rshift.s new file mode 100644 index 0000000..1da9960 --- /dev/null +++ b/sysdeps/alpha/alphaev5/rshift.s @@ -0,0 +1,173 @@ + # Alpha EV5 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.25 cycles/limb on the EV5. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + subq $31,$19,$20 + subq $18,1,$18 + and $18,4-1,$28 # number of limbs in first loop + sll $4,$20,$0 # compute function result + + beq $28,L0 + subq $18,$28,$18 + + .align 3 +Loop0: ldq $3,8($17) + addq $16,8,$16 + srl $4,$19,$5 + addq $17,8,$17 + subq $28,1,$28 + sll $3,$20,$6 + or $3,$3,$4 + or $5,$6,$8 + stq $8,-8($16) + bne $28,Loop0 + +L0: srl $4,$19,$24 + beq $18,Lend + # warm up phase 1 + ldq $1,8($17) + subq $18,4,$18 + ldq $2,16($17) + ldq $3,24($17) + ldq $4,32($17) + beq $18,Lcool1 + # warm up phase 2 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + ldq $1,40($17) + srl $2,$19,$22 + ldq $2,48($17) + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + ldq $3,56($17) + srl $4,$19,$24 + ldq $4,64($17) + subq $18,4,$18 + beq $18,Lcool2 + .align 4 + # main loop +Loop: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + + sll $1,$20,$7 + subq $18,4,$18 + srl $1,$19,$21 + unop # ldq $31,-96($17) + + sll $2,$20,$8 + ldq $1,72($17) + srl $2,$19,$22 + ldq $2,80($17) + + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + + sll $3,$20,$5 + unop # ldq $31,-96($17) + srl $3,$19,$23 + addq $16,32,$16 + + sll $4,$20,$6 + ldq $3,88($17) + srl $4,$19,$24 + ldq $4,96($17) + + addq $17,32,$17 + bne $18,Loop + unop + unop + # cool down phase 2/1 +Lcool2: stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + stq $5,16($16) + or $7,$24,$7 + stq $6,24($16) + or $8,$21,$8 + sll $3,$20,$5 + srl $3,$19,$23 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 2/2 + stq $7,32($16) + or $5,$22,$5 + stq $8,40($16) + or $6,$23,$6 + stq $5,48($16) + stq $6,56($16) + # cool down phase 2/3 + stq $24,64($16) + ret $31,($26),1 + + # cool down phase 1/1 +Lcool1: sll $1,$20,$7 + srl $1,$19,$21 + sll $2,$20,$8 + srl $2,$19,$22 + sll $3,$20,$5 + or $7,$24,$7 + srl $3,$19,$23 + or $8,$21,$8 + sll $4,$20,$6 + srl $4,$19,$24 + # cool down phase 1/2 + stq $7,0($16) + or $5,$22,$5 + stq $8,8($16) + or $6,$23,$6 + stq $5,16($16) + stq $6,24($16) + stq $24,32($16) + ret $31,($26),1 + +Lend: stq $24,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/alpha/lshift.s b/sysdeps/alpha/lshift.s new file mode 100644 index 0000000..c284349 --- /dev/null +++ b/sysdeps/alpha/lshift.s @@ -0,0 +1,108 @@ + # Alpha 21064 __mpn_lshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .frame $30,0,$26,0 + + s8addq $18,$17,$17 # make r17 point at end of s1 + ldq $4,-8($17) # load first limb + subq $17,8,$17 + subq $31,$19,$7 + s8addq $18,$16,$16 # make r16 point at end of RES + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + srl $4,$7,$0 # compute function result + + beq $20,L0 + subq $18,$20,$18 + + .align 3 +Loop0: + ldq $3,-8($17) + subq $16,8,$16 + subq $17,8,$17 + subq $20,1,$20 + sll $4,$19,$5 + srl $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,0($16) + bne $20,Loop0 + +L0: beq $18,Lend + + .align 3 +Loop: ldq $3,-8($17) + subq $16,32,$16 + subq $18,4,$18 + sll $4,$19,$5 + srl $3,$7,$6 + + ldq $4,-16($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,24($16) + srl $4,$7,$2 + + ldq $3,-24($17) + sll $4,$19,$5 + bis $1,$2,$8 + stq $8,16($16) + srl $3,$7,$6 + + ldq $4,-32($17) + sll $3,$19,$1 + bis $5,$6,$8 + stq $8,8($16) + srl $4,$7,$2 + + subq $17,32,$17 + bis $1,$2,$8 + stq $8,0($16) + + bgt $18,Loop + +Lend: sll $4,$19,$8 + stq $8,-8($16) + ret $31,($26),1 + .end __mpn_lshift diff --git a/sysdeps/alpha/mul_1.s b/sysdeps/alpha/mul_1.s new file mode 100644 index 0000000..3ef194d --- /dev/null +++ b/sysdeps/alpha/mul_1.s @@ -0,0 +1,84 @@ + # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store + # the result in a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_mul_1 + .ent __mpn_mul_1 2 +__mpn_mul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + bic $31,$31,$4 # clear cy_limb + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,8($17) # $2 = s1_limb + subq $18,1,$18 # size-- + stq $3,0($16) + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,16($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + stq $3,8($16) + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + addq $16,8,$16 # res_ptr++ + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + stq $3,8($16) + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: stq $3,0($16) + ret $31,($26),1 + + .end __mpn_mul_1 diff --git a/sysdeps/alpha/rshift.s b/sysdeps/alpha/rshift.s new file mode 100644 index 0000000..74eab04 --- /dev/null +++ b/sysdeps/alpha/rshift.s @@ -0,0 +1,106 @@ + # Alpha 21064 __mpn_rshift -- + + # Copyright (C) 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # cnt r19 + + # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling, + # it would take 4 cycles/limb. It should be possible to get down to 3 + # cycles/limb since both ldq and stq can be paired with the other used + # instructions. But there are many restrictions in the 21064 pipeline that + # makes it hard, if not impossible, to get down to 3 cycles/limb: + + # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay. + # 2. Only aligned instruction pairs can be paired. + # 3. The store buffer or silo might not be able to deal with the bandwidth. + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .frame $30,0,$26,0 + + ldq $4,0($17) # load first limb + addq $17,8,$17 + subq $31,$19,$7 + subq $18,1,$18 + and $18,4-1,$20 # number of limbs in first loop + sll $4,$7,$0 # compute function result + + beq $20,L0 + subq $18,$20,$18 + + .align 3 +Loop0: + ldq $3,0($17) + addq $16,8,$16 + addq $17,8,$17 + subq $20,1,$20 + srl $4,$19,$5 + sll $3,$7,$6 + bis $3,$3,$4 + bis $5,$6,$8 + stq $8,-8($16) + bne $20,Loop0 + +L0: beq $18,Lend + + .align 3 +Loop: ldq $3,0($17) + addq $16,32,$16 + subq $18,4,$18 + srl $4,$19,$5 + sll $3,$7,$6 + + ldq $4,8($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-32($16) + sll $4,$7,$2 + + ldq $3,16($17) + srl $4,$19,$5 + bis $1,$2,$8 + stq $8,-24($16) + sll $3,$7,$6 + + ldq $4,24($17) + srl $3,$19,$1 + bis $5,$6,$8 + stq $8,-16($16) + sll $4,$7,$2 + + addq $17,32,$17 + bis $1,$2,$8 + stq $8,-8($16) + + bgt $18,Loop + +Lend: srl $4,$19,$8 + stq $8,0($16) + ret $31,($26),1 + .end __mpn_rshift diff --git a/sysdeps/alpha/sub_n.s b/sysdeps/alpha/sub_n.s new file mode 100644 index 0000000..5200025 --- /dev/null +++ b/sysdeps/alpha/sub_n.s @@ -0,0 +1,119 @@ + # Alpha __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $16 + # s1_ptr $17 + # s2_ptr $18 + # size $19 + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .frame $30,0,$26,0 + + ldq $3,0($17) + ldq $4,0($18) + + subq $19,1,$19 + and $19,4-1,$2 # number of limbs in first loop + bis $31,$31,$0 + beq $2,.L0 # if multiple of 4 limbs, skip first loop + + subq $19,$2,$19 + +.Loop0: subq $2,1,$2 + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + addq $17,8,$17 + addq $18,8,$18 + bis $5,$5,$3 + bis $6,$6,$4 + addq $16,8,$16 + bne $2,.Loop0 + +.L0: beq $19,.Lend + + .align 3 +.Loop: subq $19,4,$19 + + ldq $5,8($17) + addq $4,$0,$4 + ldq $6,8($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + + ldq $3,16($17) + addq $6,$0,$6 + ldq $4,16($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,8($16) + or $0,$1,$0 + + ldq $5,24($17) + addq $4,$0,$4 + ldq $6,24($18) + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,16($16) + or $0,$1,$0 + + ldq $3,32($17) + addq $6,$0,$6 + ldq $4,32($18) + cmpult $6,$0,$1 + subq $5,$6,$6 + cmpult $5,$6,$0 + stq $6,24($16) + or $0,$1,$0 + + addq $17,32,$17 + addq $18,32,$18 + addq $16,32,$16 + bne $19,.Loop + +.Lend: addq $4,$0,$4 + cmpult $4,$0,$1 + subq $3,$4,$4 + cmpult $3,$4,$0 + stq $4,0($16) + or $0,$1,$0 + ret $31,($26),1 + + .end __mpn_sub_n diff --git a/sysdeps/alpha/submul_1.s b/sysdeps/alpha/submul_1.s new file mode 100644 index 0000000..acaa11c --- /dev/null +++ b/sysdeps/alpha/submul_1.s @@ -0,0 +1,100 @@ + # Alpha 21064 __mpn_submul_1 -- Multiply a limb vector with a limb and + # subtract the result from a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr r16 + # s1_ptr r17 + # size r18 + # s2_limb r19 + + # This code runs at 42 cycles/limb on the 21064. + + # To improve performance for long multiplications, we would use + # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use + # these instructions without slowing down the general code: 1. We can + # only have two prefetches in operation at any time in the Alpha + # architecture. 2. There will seldom be any special alignment + # between RES_PTR and S1_PTR. Maybe we can simply divide the current + # loop into an inner and outer loop, having the inner loop handle + # exactly one prefetch block? + + .set noreorder + .set noat +.text + .align 3 + .globl __mpn_submul_1 + .ent __mpn_submul_1 2 +__mpn_submul_1: + .frame $30,0,$26 + + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + umulh $2,$19,$0 # $0 = prod_high + beq $18,Lend1 # jump if size was == 1 + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + subq $18,1,$18 # size-- + subq $5,$3,$3 + cmpult $5,$3,$4 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + beq $18,Lend2 # jump if size was == 2 + + .align 3 +Loop: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + subq $18,1,$18 # size-- + umulh $2,$19,$4 # $4 = cy_limb + ldq $2,0($17) # $2 = s1_limb + addq $17,8,$17 # s1_ptr++ + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $16,8,$16 # res_ptr++ + addq $5,$0,$0 # combine carries + bne $18,Loop + +Lend2: mulq $2,$19,$3 # $3 = prod_low + ldq $5,0($16) # $5 = *res_ptr + addq $4,$0,$0 # cy_limb = cy_limb + 'cy' + umulh $2,$19,$4 # $4 = cy_limb + addq $3,$0,$3 # $3 = cy_limb + prod_low + cmpult $3,$0,$0 # $0 = carry from (cy_limb + prod_low) + subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $5,$0,$0 # combine carries + addq $4,$0,$0 # cy_limb = prod_high + cy + ret $31,($26),1 +Lend1: subq $5,$3,$3 + cmpult $5,$3,$5 + stq $3,0($16) + addq $0,$5,$0 + ret $31,($26),1 + + .end __mpn_submul_1 diff --git a/sysdeps/alpha/udiv_qrnnd.S b/sysdeps/alpha/udiv_qrnnd.S index 942d7a8..bafafd6 100644 --- a/sysdeps/alpha/udiv_qrnnd.S +++ b/sysdeps/alpha/udiv_qrnnd.S @@ -134,7 +134,7 @@ Loop2: cmplt n0,0,tmp ret $31,($26),1 Odd: - /* q' in n0. r' in n1. */ + /* q' in n0. r' in n1 */ addq n1,n0,n1 cmpult n1,n0,tmp # tmp := carry from addq beq tmp,LLp6 diff --git a/sysdeps/generic/divmod_1.c b/sysdeps/generic/divmod_1.c index d156eeb..2989d36 100644 --- a/sysdeps/generic/divmod_1.c +++ b/sysdeps/generic/divmod_1.c @@ -83,14 +83,12 @@ __mpn_divmod_1 (quot_ptr, dividend_ptr, dividend_size, divisor_limb) result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the most significant bit (with weight 2**N) implicit. */ -#if 0 /* This can't happen when normalization_steps != 0 */ /* Special case for DIVISOR_LIMB == 100...000. */ if (divisor_limb << 1 == 0) divisor_limb_inverted = ~(mp_limb) 0; else -#endif - udiv_qrnnd (divisor_limb_inverted, dummy, - -divisor_limb, 0, divisor_limb); + udiv_qrnnd (divisor_limb_inverted, dummy, + -divisor_limb, 0, divisor_limb); n1 = dividend_ptr[dividend_size - 1]; r = n1 >> (BITS_PER_MP_LIMB - normalization_steps); diff --git a/sysdeps/generic/mod_1.c b/sysdeps/generic/mod_1.c index ae4ed09..8a49fb4 100644 --- a/sysdeps/generic/mod_1.c +++ b/sysdeps/generic/mod_1.c @@ -3,8 +3,6 @@ Return the single-limb remainder. There are no constraints on the value of the divisor. - QUOT_PTR and DIVIDEND_PTR might point to the same limb. - Copyright (C) 1991, 1993, 1994, Free Software Foundation, Inc. This file is part of the GNU MP Library. diff --git a/sysdeps/hppa/add_n.s b/sysdeps/hppa/add_n.s new file mode 100644 index 0000000..7f3e323 --- /dev/null +++ b/sysdeps/hppa/add_n.s @@ -0,0 +1,57 @@ +; HP-PA __mpn_add_n -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __mpn_add_n +__mpn_add_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + add %r20,%r19,%r28 ; add first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + addc %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + bv 0(%r2) + addc %r0,%r0,%r28 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/addmul_1.s b/sysdeps/hppa/hppa1.1/addmul_1.s new file mode 100644 index 0000000..a9dfdd1 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/addmul_1.s @@ -0,0 +1,101 @@ +; HP-PA-1.1 __mpn_addmul_1 -- Multiply a limb vector with a limb and +; add the result to a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 11 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 10 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + + .code + .export __mpn_addmul_1 +__mpn_addmul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + add %r29,%r19,%r19 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + add %r29,%r1,%r19 + stw %r19,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/mul_1.s b/sysdeps/hppa/hppa1.1/mul_1.s new file mode 100644 index 0000000..ebf0778 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/mul_1.s @@ -0,0 +1,97 @@ +; HP-PA-1.1 __mpn_mul_1 -- Multiply a limb vector with a limb and store +; the result in a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 9 cycles/limb on a PA7000. With the used instructions, it can +; not become faster due to data cache contention after a store. On the +; PA7100 it runs at 7 cycles/limb, and that can not be improved either, since +; only the xmpyu does not need the integer pipeline, so the only dual-issue +; we will get are addc+xmpyu. Unrolling would not help either CPU. + +; We could use fldds to read two limbs at a time from the S1 array, and that +; could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and +; PA7100, respectively. We don't do that since it does not seem worth the +; (alignment) troubles... + +; At least the PA7100 is rumored to be able to deal with cache-misses +; without stalling instruction issue. If this is true, and the cache is +; actually also lockup-free, we should use a deeper software pipeline, and +; load from S1 very early! (The loads and stores to -12(sp) will surely be +; in the cache.) + + .code + .export __mpn_mul_1 +__mpn_mul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop fldws,ma 4(%r25),%fr5 + stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end stws,ma %r19,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + stws,ma %r19,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + fstds %fr6,-16(%r30) + ldw -16(%r30),%r28 + ldo -64(%r30),%r30 + bv 0(%r2) + fstws %fr6R,0(%r26) + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/submul_1.s b/sysdeps/hppa/hppa1.1/submul_1.s new file mode 100644 index 0000000..44cabf4 --- /dev/null +++ b/sysdeps/hppa/hppa1.1/submul_1.s @@ -0,0 +1,110 @@ +; HP-PA-1.1 __mpn_submul_1 -- Multiply a limb vector with a limb and +; subtract the result from a second limb vector. + +; Copyright (C) 1992, 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r26 +; s1_ptr r25 +; size r24 +; s2_limb r23 + +; This runs at 12 cycles/limb on a PA7000. With the used instructions, it +; can not become faster due to data cache contention after a store. On the +; PA7100 it runs at 11 cycles/limb, and that can not be improved either, +; since only the xmpyu does not need the integer pipeline, so the only +; dual-issue we will get are addc+xmpyu. Unrolling could gain a cycle/limb +; on the PA7100. + +; There are some ideas described in mul_1.s that applies to this code too. + +; It seems possible to make this run as fast as __mpn_addmul_1, if we use +; sub,>>= %r29,%r19,%r22 +; addi 1,%r28,%r28 +; but that requires reworking the hairy software pipeline... + + .code + .export __mpn_submul_1 +__mpn_submul_1 + .proc + .callinfo frame=64,no_calls + .entry + + ldo 64(%r30),%r30 + fldws,ma 4(%r25),%fr5 + stw %r23,-16(%r30) ; move s2_limb ... + addib,= -1,%r24,L$just_one_limb + fldws -16(%r30),%fr4 ; ... into fr4 + add %r0,%r0,%r0 ; clear carry + xmpyu %fr4,%fr5,%fr6 + fldws,ma 4(%r25),%fr7 + fstds %fr6,-16(%r30) + xmpyu %fr4,%fr7,%fr8 + ldw -12(%r30),%r19 ; least significant limb in product + ldw -16(%r30),%r28 + + fstds %fr8,-16(%r30) + addib,= -1,%r24,L$end + ldw -12(%r30),%r1 + +; Main loop +L$loop ldws 0(%r26),%r29 + fldws,ma 4(%r25),%fr5 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + xmpyu %fr4,%fr5,%fr6 + ldw -16(%r30),%r28 + fstds %fr6,-16(%r30) + addc %r0,%r28,%r28 + addib,<> -1,%r24,L$loop + ldw -12(%r30),%r1 + +L$end ldw 0(%r26),%r29 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r28,%r1,%r19 + ldw -16(%r30),%r28 + ldws 0(%r26),%r29 + addc %r0,%r28,%r28 + sub %r29,%r19,%r22 + add %r22,%r19,%r0 + stws,ma %r22,4(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + +L$just_one_limb + xmpyu %fr4,%fr5,%fr6 + ldw 0(%r26),%r29 + fstds %fr6,-16(%r30) + ldw -12(%r30),%r1 + ldw -16(%r30),%r28 + sub %r29,%r1,%r22 + add %r22,%r1,%r0 + stw %r22,0(%r26) + addc %r0,%r28,%r28 + bv 0(%r2) + ldo -64(%r30),%r30 + + .exit + .procend diff --git a/sysdeps/hppa/hppa1.1/udiv_qrnnd.s b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s new file mode 100644 index 0000000..4ffef3a --- /dev/null +++ b/sysdeps/hppa/hppa1.1/udiv_qrnnd.s @@ -0,0 +1,74 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on PA 7000 and later. + +; Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + + .code +L$0000 .word 0x43f00000 + .word 0x0 + .export __udiv_qrnnd +__udiv_qrnnd + .proc + .callinfo frame=64,no_calls + .entry + ldo 64(%r30),%r30 + + stws %r25,-16(0,%r30) ; n_hi + stws %r24,-12(0,%r30) ; n_lo + ldil L'L$0000,%r19 + ldo R'L$0000(%r19),%r19 + fldds -16(0,%r30),%fr5 + stws %r23,-12(0,%r30) + comib,<= 0,%r25,L$1 + fcnvxf,dbl,dbl %fr5,%fr5 + fldds 0(0,%r19),%fr4 + fadd,dbl %fr4,%fr5,%fr5 +L$1 + fcpy,sgl %fr0,%fr6L + fldws -12(0,%r30),%fr6R + fcnvxf,dbl,dbl %fr6,%fr4 + + fdiv,dbl %fr5,%fr4,%fr5 + + fcnvfx,dbl,dbl %fr5,%fr4 + fstws %fr4R,-16(%r30) + xmpyu %fr4R,%fr6R,%fr6 + ldws -16(%r30),%r28 + fstds %fr6,-16(0,%r30) + ldws -12(0,%r30),%r21 + ldws -16(0,%r30),%r20 + sub %r24,%r21,%r22 + subb %r25,%r20,%r19 + comib,= 0,%r19,L$2 + ldo -64(%r30),%r30 + + add %r22,%r23,%r22 + ldo -1(%r28),%r28 +L$2 bv 0(%r2) + stws %r22,0(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/lshift.s b/sysdeps/hppa/lshift.s new file mode 100644 index 0000000..0479f4a --- /dev/null +++ b/sysdeps/hppa/lshift.s @@ -0,0 +1,65 @@ +; HP-PA __mpn_lshift -- + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_lshift +__mpn_lshift + .proc + .callinfo frame=64,no_calls + .entry + + sh2add %r24,%r25,%r25 + sh2add %r24,%r26,%r26 + ldws,mb -4(0,%r25),%r22 + subi 32,%r23,%r1 + mtsar %r1 + addib,= -1,%r24,L$0004 + vshd %r0,%r22,%r28 ; compute carry out limb + ldws,mb -4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r22,%r29,%r20 + +L$loop ldws,mb -4(0,%r25),%r22 + stws,mb %r20,-4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r29,%r22,%r20 + ldws,mb -4(0,%r25),%r29 + stws,mb %r20,-4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r22,%r29,%r20 + +L$0002 stws,mb %r20,-4(0,%r26) + vshd %r29,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) +L$0003 stws,mb %r20,-4(0,%r26) +L$0004 vshd %r22,%r0,%r20 + bv 0(%r2) + stw %r20,-4(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/rshift.s b/sysdeps/hppa/rshift.s new file mode 100644 index 0000000..18d33f2 --- /dev/null +++ b/sysdeps/hppa/rshift.s @@ -0,0 +1,62 @@ +; HP-PA __mpn_rshift -- + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s_ptr gr25 +; size gr24 +; cnt gr23 + + .code + .export __mpn_rshift +__mpn_rshift + .proc + .callinfo frame=64,no_calls + .entry + + ldws,ma 4(0,%r25),%r22 + mtsar %r23 + addib,= -1,%r24,L$0004 + vshd %r22,%r0,%r28 ; compute carry out limb + ldws,ma 4(0,%r25),%r29 + addib,= -1,%r24,L$0002 + vshd %r29,%r22,%r20 + +L$loop ldws,ma 4(0,%r25),%r22 + stws,ma %r20,4(0,%r26) + addib,= -1,%r24,L$0003 + vshd %r22,%r29,%r20 + ldws,ma 4(0,%r25),%r29 + stws,ma %r20,4(0,%r26) + addib,<> -1,%r24,L$loop + vshd %r29,%r22,%r20 + +L$0002 stws,ma %r20,4(0,%r26) + vshd %r0,%r29,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) +L$0003 stws,ma %r20,4(0,%r26) +L$0004 vshd %r0,%r22,%r20 + bv 0(%r2) + stw %r20,0(0,%r26) + + .exit + .procend diff --git a/sysdeps/hppa/sub_n.s b/sysdeps/hppa/sub_n.s new file mode 100644 index 0000000..daae46e --- /dev/null +++ b/sysdeps/hppa/sub_n.s @@ -0,0 +1,58 @@ +; HP-PA __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr gr26 +; s1_ptr gr25 +; s2_ptr gr24 +; size gr23 + +; One might want to unroll this as for other processors, but it turns +; out that the data cache contention after a store makes such +; unrolling useless. We can't come under 5 cycles/limb anyway. + + .code + .export __mpn_sub_n +__mpn_sub_n + .proc + .callinfo frame=0,no_calls + .entry + + ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + + addib,= -1,%r23,L$end ; check for (SIZE == 1) + sub %r20,%r19,%r28 ; subtract first limbs ignoring cy + +L$loop ldws,ma 4(0,%r25),%r20 + ldws,ma 4(0,%r24),%r19 + stws,ma %r28,4(0,%r26) + addib,<> -1,%r23,L$loop + subb %r20,%r19,%r28 + +L$end stws %r28,0(0,%r26) + addc %r0,%r0,%r28 + bv 0(%r2) + subi 1,%r28,%r28 + + .exit + .procend diff --git a/sysdeps/hppa/udiv_qrnnd.s b/sysdeps/hppa/udiv_qrnnd.s new file mode 100644 index 0000000..0b069bf --- /dev/null +++ b/sysdeps/hppa/udiv_qrnnd.s @@ -0,0 +1,285 @@ +; HP-PA __udiv_qrnnd division support, used from longlong.h. +; This version runs fast on pre-PA7000 CPUs. + +; Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; rem_ptr gr26 +; n1 gr25 +; n0 gr24 +; d gr23 + +; The code size is a bit excessive. We could merge the last two ds;addc +; sequences by simply moving the "bb,< Odd" instruction down. The only +; trouble is the FFFFFFFF code that would need some hacking. + + .code + .export __udiv_qrnnd +__udiv_qrnnd + .proc + .callinfo frame=0,no_calls + .entry + + comb,< %r23,0,L$largedivisor + sub %r0,%r23,%r1 ; clear cy as side-effect + ds %r0,%r1,%r0 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r24 + ds %r25,%r23,%r25 + addc %r24,%r24,%r28 + ds %r25,%r23,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r23,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r28,%r28,%r28 + +L$largedivisor + extru %r24,31,1,%r19 ; r19 = n0 & 1 + bb,< %r23,31,L$odd + extru %r23,30,31,%r22 ; r22 = d >> 1 + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r24,%r24,%r28 + +L$odd addib,sv,n 1,%r22,L$FF.. ; r22 = (d / 2 + 1) + shd %r25,%r24,1,%r24 ; r24 = new n0 + extru %r25,30,31,%r25 ; r25 = new n1 + sub %r0,%r22,%r21 + ds %r0,%r21,%r0 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r24 + ds %r25,%r22,%r25 + addc %r24,%r24,%r28 + comclr,>= %r25,%r0,%r0 + addl %r25,%r22,%r25 + sh1addl %r25,%r19,%r25 +; We have computed (n1,,n0) / (d + 1), q' = r28, r' = r25 + add,nuv %r28,%r25,%r25 + addl %r25,%r1,%r25 + addc %r0,%r28,%r28 + sub,<< %r25,%r23,%r0 + addl %r25,%r1,%r25 + stws %r25,0(0,%r26) + bv 0(%r2) + addc %r0,%r28,%r28 + +; This is just a special case of the code above. +; We come here when d == 0xFFFFFFFF +L$FF.. add,uv %r25,%r24,%r24 + sub,<< %r24,%r23,%r0 + ldo 1(%r24),%r24 + stws %r24,0(0,%r26) + bv 0(%r2) + addc %r0,%r25,%r28 + + .exit + .procend diff --git a/sysdeps/i386/add_n.S b/sysdeps/i386/add_n.S index c4e71ea..c3b3c3e 100644 --- a/sysdeps/i386/add_n.S +++ b/sysdeps/i386/add_n.S @@ -1,7 +1,7 @@ /* i80386 __mpn_add_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_add_n:) subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #ifdef PIC - call here -here: leal (Loop - 3 - here)(%eax,%eax,8),%eax - addl %eax,(%esp) - ret +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L0 +L0: leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(Loop-L0-3),%eax + addl $4,%esp #else - leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */ - jmp *%eax /* jump into loop */ +/* Calculate start address in loop for non-PIC. */ + leal (Loop - 3)(%eax,%eax,8),%eax #endif + jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax adcl (%edx),%eax diff --git a/sysdeps/i386/gmp-mparam.h b/sysdeps/i386/gmp-mparam.h new file mode 100644 index 0000000..687f12a --- /dev/null +++ b/sysdeps/i386/gmp-mparam.h @@ -0,0 +1,28 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/sysdeps/i386/i486/strcat.S b/sysdeps/i386/i486/strcat.S new file mode 100644 index 0000000..e3d2181 --- /dev/null +++ b/sysdeps/i386/i486/strcat.S @@ -0,0 +1,260 @@ +/* strcat(dest, src) -- Append SRC on the end of DEST. +For Intel 80x86, x>=4. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper . +Optimised a little by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + dest (sp + 4) + src (sp + 8) +*/ + + .text +ENTRY (strcat) + pushl %edi /* Save callee-safe register. */ + + movl 12(%esp), %ecx /* load source pointer */ + movl 8(%esp), %edx /* load destination pointer */ + + testb $0xff, (%ecx) /* Is source string empty? */ + jz L8 /* yes => return */ + + /* Test the first bytes separately until destination is aligned. */ + testb $3, %edx /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment source pointer */ + + testb $3, %edx /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment source pointer */ + + testb $3, %edx /* destination pointer aligned? */ + jz L1 /* yes => begin scan loop */ + testb $0xff, (%edx) /* is end of string? */ + jz L2 /* yes => start appending */ + incl %edx /* increment source pointer */ + + /* Now we are aligned. Begin scan loop. */ + jmp L1 + + ALIGN(4) + +L4: addl $16,%edx /* increment destination pointer for round */ + +L1: movl (%edx), %eax /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + + /* If you compare this with the algorithm in memchr.S you will + notice that here is an `xorl' statement missing. But you must + not forget that we are looking for C == 0 and `xorl $0, %eax' + is a no-op. */ + + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occured in the last byte => it was 0. */ + jnc L3 + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %ecx. */ + jnz L3 + + movl 4(%edx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* one byte is NUL => stop copying */ + + movl 8(%edx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L6 /* one byte is NUL => stop copying */ + + movl 12(%edx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L4 /* no byte is NUL => carry on copying */ + +L7: addl $4, %edx /* adjust source pointer */ +L6: addl $4, %edx +L5: addl $4, %edx + +L3: testb %al, %al /* is first byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment source pointer */ + + testb %ah, %ah /* is second byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment source pointer */ + + testl $0xff0000, %eax /* is third byte NUL? */ + jz L2 /* yes => start copying */ + incl %edx /* increment source pointer */ + +L2: subl %ecx, %edx /* reduce number of loop variants */ + + /* Now we have to align the source pointer. */ + testb $3, %ecx /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andl %al, %al /* is byte NUL? */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + testb $3, %ecx /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andl %al, %al /* is byte NUL? */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + testb $3, %ecx /* pointer correctly aligned? */ + jz L29 /* yes => start copy loop */ + movb (%ecx), %al /* get first byte */ + movb %al, (%ecx,%edx) /* and store it */ + andl %al, %al /* is byte NUL? */ + jz L8 /* yes => return */ + incl %ecx /* increment pointer */ + + /* Now we are aligned. */ + jmp L29 /* start copy loop */ + + ALIGN(4) + +L28: movl %eax, 12(%ecx,%edx)/* store word at destination */ + addl $16, %ecx /* adjust pointer for full round */ + +L29: movl (%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L9 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L9 /* one byte is NUL => stop copying */ + movl %eax, (%ecx,%edx) /* store word to destination */ + + movl 4(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L91 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L91 /* one byte is NUL => stop copying */ + movl %eax, 4(%ecx,%edx) /* store word to destination */ + + movl 8(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L92 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L92 /* one byte is NUL => stop copying */ + movl %eax, 8(%ecx,%edx) /* store word to destination */ + + movl 12(%ecx), %eax /* get word from source */ + movl $0xfefefeff, %edi /* magic value */ + addl %eax, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L93 /* highest byte is C => stop copying */ + xorl %eax, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L28 /* no is NUL => carry on copying */ + +L93: addl $4, %ecx /* adjust pointer */ +L92: addl $4, %ecx +L91: addl $4, %ecx + +L9: movb %al, (%ecx,%edx) /* store first byte of last word */ + orb %al, %al /* is it NUL? */ + jz L8 /* yes => return */ + + movb %ah, 1(%ecx,%edx) /* store second byte of last word */ + orb %ah, %ah /* is it NUL? */ + jz L8 /* yes => return */ + + shrl $16, %eax /* make upper bytes accessible */ + movb %al, 2(%ecx,%edx) /* store third byte of last word */ + orb %al, %al /* is it NUL? */ + jz L8 /* yes => return */ + + movb %ah, 3(%ecx,%edx) /* store fourth byte of last word */ + +L8: movl 8(%esp), %eax /* start address of destination is result */ + popl %edi /* restore saved register */ + + ret diff --git a/sysdeps/i386/i486/strlen.S b/sysdeps/i386/i486/strlen.S new file mode 100644 index 0000000..276563b --- /dev/null +++ b/sysdeps/i386/i486/strlen.S @@ -0,0 +1,132 @@ +/* strlen(str) -- determine the length of the string STR. +Optimized for Intel 80x86, x>=4. +Copyright (C) 1991, 1992, 1993, 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper . +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) +*/ + + .text +ENTRY (strlen) + movl 4(%esp), %ecx /* get string pointer */ + movl %ecx, %eax /* duplicate it */ + + andl $3, %ecx /* mask alignment bits */ + jz L1 /* aligned => start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + incl %eax /* increment pointer */ + + xorl $3, %ecx /* was alignment = 3? */ + jz L1 /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + addl $1, %eax /* increment pointer */ + + subl $1, %ecx /* was alignment = 2? */ + jz L1 /* yes => now it is aligned and start loop */ + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + +/* Don't change the above `addl $1,%eax' and `subl $1, %ecx' into `incl %eax' + and `decl %ecx' resp. The additional two byte per instruction make the + label 4 to be aligned on a 16 byte boundary with nops. + + The following `sub $15, %eax' is part of this trick, too. Together with + the next instruction (`addl $16, %eax') it is in fact a `incl %eax', just + as expected from the algorithm. But doing so has the advantage that + no jump to label 1 is necessary and so the pipeline is not flushed. */ + + subl $15, %eax /* effectively +1 */ + + +L4: addl $16, %eax /* adjust pointer for full loop */ + +L1: movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L3 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L3 /* found NUL => return pointer */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* found NUL => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L6 /* found NUL => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edx /* magic value */ + addl %ecx, %edx /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* highest byte is NUL => return pointer */ + xorl %ecx, %edx /* (word+magic)^word */ + orl $0xfefefeff, %edx /* set all non-carry bits */ + incl %edx /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L4 /* no NUL found => continue loop */ + +L7: addl $4, %eax /* adjust pointer */ +L6: addl $4, %eax +L5: addl $4, %eax + +L3: testb %cl, %cl /* is first byte NUL? */ + jz L2 /* yes => return */ + incl %eax /* increment pointer */ + + testb %ch, %ch /* is second byte NUL? */ + jz L2 /* yes => return */ + incl %eax /* increment pointer */ + + testl $0xff0000, %ecx /* is third byte NUL? */ + jz L2 /* yes => return pointer */ + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* compute difference to string start */ + + ret diff --git a/sysdeps/i386/i586/Implies b/sysdeps/i386/i586/Implies new file mode 100644 index 0000000..477cd74 --- /dev/null +++ b/sysdeps/i386/i586/Implies @@ -0,0 +1,2 @@ +# Code optimized for i486 is better than simple i386 code. +i386/i486 diff --git a/sysdeps/i386/i586/add_n.S b/sysdeps/i386/i586/add_n.S new file mode 100644 index 0000000..9be45ed --- /dev/null +++ b/sysdeps/i386/i586/add_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_add_n -- Add two limb vectors of the same length > 0 and store + sum in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_add_n) +C_SYMBOL_NAME(__mpn_add_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + adcl x,r1 + movl 4(src2),x + adcl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + adcl x,r1 + movl 12(src2),x + adcl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + adcl x,r1 + movl 20(src2),x + adcl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + adcl x,r1 + movl 28(src2),x + adcl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + adcl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + adcl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/addmul_1.S b/sysdeps/i386/i586/addmul_1.S new file mode 100644 index 0000000..b222840 --- /dev/null +++ b/sysdeps/i386/i586/addmul_1.S @@ -0,0 +1,84 @@ +/* Pentium __mpn_addmul_1 -- Multiply a limb vector with a limb and add + the result to a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_addmul_1) + .type C_SYMBOL_NAME(__mpn_addmul_1),@function +C_SYMBOL_NAME(__mpn_addmul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(ecx),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,ecx,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,ecx,4)) + INSN1(neg,l ,R(ecx)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,ecx,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,ecx,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(add,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,ecx,4),R(ebx)) + + INSN1(inc,l ,R(ecx)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret +Lfe1: + .size C_SYMBOL_NAME(__mpn_addmul_1),Lfe1-C_SYMBOL_NAME(__mpn_addmul_1) diff --git a/sysdeps/i386/i586/lshift.S b/sysdeps/i386/i586/lshift.S new file mode 100644 index 0000000..b9f8131 --- /dev/null +++ b/sysdeps/i386/i586/lshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_lshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_lshift) +C_SYMBOL_NAME(__mpn_lshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + xorl %eax,%eax + shldl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + shldl %cl,%ebx,%edx + shldl %cl,%eax,%ebx + movl %edx,-8(%edi) + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + shldl %cl,%edx,%eax + shldl %cl,%ebx,%edx + movl %eax,-16(%edi) + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + shldl %cl,%eax,%ebx + shldl %cl,%edx,%eax + movl %ebx,-24(%edi) + movl %eax,-28(%edi) + + subl $32,%esi + subl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shldl %cl,%eax,%edx + movl %edx,(%edi) + movl %eax,%edx + subl $4,%esi + subl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shll %cl,%edx /* compute least significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. +*/ + +Lspecial: + movl (%esi),%edx + addl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + addl %edx,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + adcl %eax,%eax + movl %ebx,(%edi) + adcl %edx,%edx + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + adcl %ebx,%ebx + movl %edx,8(%edi) + adcl %eax,%eax + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + adcl %edx,%edx + movl %eax,16(%edi) + adcl %ebx,%ebx + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + adcl %eax,%eax + movl %ebx,24(%edi) + adcl %edx,%edx + movl %eax,28(%edi) + + leal 32(%esi),%esi /* use leal not to clobber carry */ + leal 32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + adcl %edx,%edx + movl %ebx,(%edi) + + leal 4(%esi),%esi /* use leal not to clobber carry */ + leal 4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/memcopy.h b/sysdeps/i386/i586/memcopy.h index a9bb9e7..0a87687 100644 --- a/sysdeps/i386/i586/memcopy.h +++ b/sysdeps/i386/i586/memcopy.h @@ -1,5 +1,5 @@ /* memcopy.h -- definitions for memory copy functions. Pentium version. - Copyright (C) 1994 Free Software Foundation, Inc. + Copyright (C) 1994, 1995 Free Software Foundation, Inc. Contributed by Torbjorn Granlund (tege@sics.se). This file is part of the GNU C Library. @@ -88,7 +88,7 @@ Cambridge, MA 02139, USA. */ "subl $32,%2\n" \ "jns 1b\n" \ "2: addl $32,%2" : \ - "=r" (dst_bp), "=r" (src_bp), "=r" (nbytes_left) : \ - "0" (dst_bp), "1" (src_bp), "2" (nbytes) : \ + "=r" (dst_ep), "=r" (src_ep), "=r" (nbytes_left) : \ + "0" (dst_ep), "1" (src_ep), "2" (nbytes) : \ "ax", "dx"); \ } while (0) diff --git a/sysdeps/i386/i586/mul_1.S b/sysdeps/i386/i586/mul_1.S new file mode 100644 index 0000000..2b7258e --- /dev/null +++ b/sysdeps/i386/i586/mul_1.S @@ -0,0 +1,78 @@ +/* Pentium __mpn_mul_1 -- Multiply a limb vector with a limb and store + the result in a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_mul_1) +C_SYMBOL_NAME(__mpn_mul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(eax)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/i586/rshift.S b/sysdeps/i386/i586/rshift.S new file mode 100644 index 0000000..51cde8f --- /dev/null +++ b/sysdeps/i386/i586/rshift.S @@ -0,0 +1,213 @@ +/* Pentium optimized __mpn_rshift -- + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s_ptr (sp + 8) + size (sp + 12) + cnt (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_rshift) +C_SYMBOL_NAME(__mpn_rshift:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),%edi /* res_ptr */ + movl 24(%esp),%esi /* s_ptr */ + movl 28(%esp),%ebp /* size */ + movl 32(%esp),%ecx /* cnt */ + + cmp $1,%ecx + jne Lnormal + movl %edi,%eax + subl %esi,%eax + cmpl %ebp,%eax + jnc Lspecial + +Lnormal: + movl (%esi),%edx + addl $4,%esi + xorl %eax,%eax + shrdl %cl,%edx,%eax /* compute carry limb */ + pushl %eax /* push carry limb onto stack */ + + decl %ebp + pushl %ebp + shrl $3,%ebp + jz Lend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +Loop: movl 28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl 4(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,(%edi) + movl %eax,4(%edi) + + movl 8(%esi),%ebx + movl 12(%esi),%eax + shrdl %cl,%ebx,%edx + shrdl %cl,%eax,%ebx + movl %edx,8(%edi) + movl %ebx,12(%edi) + + movl 16(%esi),%edx + movl 20(%esi),%ebx + shrdl %cl,%edx,%eax + shrdl %cl,%ebx,%edx + movl %eax,16(%edi) + movl %edx,20(%edi) + + movl 24(%esi),%eax + movl 28(%esi),%edx + shrdl %cl,%eax,%ebx + shrdl %cl,%edx,%eax + movl %ebx,24(%edi) + movl %eax,28(%edi) + + addl $32,%esi + addl $32,%edi + decl %ebp + jnz Loop + +Lend: popl %ebp + andl $7,%ebp + jz Lend2 +Loop2: movl (%esi),%eax + shrdl %cl,%eax,%edx /* compute result limb */ + movl %edx,(%edi) + movl %eax,%edx + addl $4,%esi + addl $4,%edi + decl %ebp + jnz Loop2 + +Lend2: shrl %cl,%edx /* compute most significant limb */ + movl %edx,(%edi) /* store it */ + + popl %eax /* pop carry limb */ + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret + +/* We loop from least significant end of the arrays, which is only + permissable if the source and destination don't overlap, since the + function is documented to work for overlapping source and destination. +*/ + +Lspecial: + leal -4(%edi,%ebp,4),%edi + leal -4(%esi,%ebp,4),%esi + + movl (%esi),%edx + subl $4,%esi + + decl %ebp + pushl %ebp + shrl $3,%ebp + + shrl $1,%edx + incl %ebp + decl %ebp + jz LLend + + movl (%edi),%eax /* fetch destination cache line */ + + ALIGN (2) +LLoop: movl -28(%edi),%eax /* fetch destination cache line */ + movl %edx,%ebx + + movl (%esi),%eax + movl -4(%esi),%edx + rcrl $1,%eax + movl %ebx,(%edi) + rcrl $1,%edx + movl %eax,-4(%edi) + + movl -8(%esi),%ebx + movl -12(%esi),%eax + rcrl $1,%ebx + movl %edx,-8(%edi) + rcrl $1,%eax + movl %ebx,-12(%edi) + + movl -16(%esi),%edx + movl -20(%esi),%ebx + rcrl $1,%edx + movl %eax,-16(%edi) + rcrl $1,%ebx + movl %edx,-20(%edi) + + movl -24(%esi),%eax + movl -28(%esi),%edx + rcrl $1,%eax + movl %ebx,-24(%edi) + rcrl $1,%edx + movl %eax,-28(%edi) + + leal -32(%esi),%esi /* use leal not to clobber carry */ + leal -32(%edi),%edi + decl %ebp + jnz LLoop + +LLend: popl %ebp + sbbl %eax,%eax /* save carry in %eax */ + andl $7,%ebp + jz LLend2 + addl %eax,%eax /* restore carry from eax */ +LLoop2: movl %edx,%ebx + movl (%esi),%edx + rcrl $1,%edx + movl %ebx,(%edi) + + leal -4(%esi),%esi /* use leal not to clobber carry */ + leal -4(%edi),%edi + decl %ebp + jnz LLoop2 + + jmp LL1 +LLend2: addl %eax,%eax /* restore carry from eax */ +LL1: movl %edx,(%edi) /* store last limb */ + + movl $0,%eax + rcrl $1,%eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/strchr.S b/sysdeps/i386/i586/strchr.S new file mode 100644 index 0000000..982c80e --- /dev/null +++ b/sysdeps/i386/i586/strchr.S @@ -0,0 +1,334 @@ +/* strchr -- find character CH in a NUL terminated string. +Highly optimized version for ix85, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strchr) + pushl %edi /* Save callee-safe registers. */ + pushl %esi + + pushl %ebx + pushl %ebp + + movl 20(%esp), %eax /* get string pointer */ + movl 24(%esp), %edx /* get character we are looking for */ + + movl %eax, %edi /* duplicate string pointer for later */ + xorl %ecx, %ecx /* clear %ecx */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movb %dl, %cl /* we construct the lower half in %ecx */ + + shll $16, %edx /* now %edx is c|c|0|0 */ + movb %cl, %ch /* now %ecx is 0|0|c|c */ + + orl %ecx, %edx /* and finally c|c|c|c */ + andl $3, %edi /* mask alignment bits */ + + jz L11 /* alignment is 0 => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $3, %edi /* was alignment == 3? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + cmp $2, %edi /* was alignment == 2? */ + + je L11 /* yes => start loop */ + + movb (%eax), %cl /* load single byte */ + cmpb %cl, %dl /* is byte == C? */ + + je L2 /* aligned => return pointer */ + + cmp $0, %cl /* is byte NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The following code is the preparation for the loop. The + four instruction up to `L1' will not be executed in the loop + because the same code is found at the end of the loop, but + there it is executed in parallel with other instructions. */ +L11: movl (%eax), %ecx + movl $magic, %ebp + + movl $magic, %edi + addl %ecx, %ebp + + /* The main loop: it looks complex and indeed it is. I would + love to say `it was hard to write, so it should he hard to + read' but I will give some more hints. To fully understand + this code you should first take a look at the i486 version. + The basic algorithm is the same, but here the code organized + in a way which permits to use both pipelines all the time. + + I tried to make it a bit more understandable by indenting + the code according to stage in the algorithm. It goes as + follows: + check for 0 in 1st word + check for C in 1st word + check for 0 in 2nd word + check for C in 2nd word + check for 0 in 3rd word + check for C in 3rd word + check for 0 in 4th word + check for C in 4th word + + Please note that doing the test for NUL before the test for + C allows us to overlap the test for 0 in the next word with + the test for C. */ + +L1: xorl %ecx, %ebp /* (word^magic) */ + addl %ecx, %edi /* add magic word */ + + leal 4(%eax), %eax /* increment pointer */ + jnc L4 /* previous addl caused overflow? */ + + movl %ecx, %ebx /* duplicate original word */ + orl $magic, %ebp /* (word^magic)|magic */ + + addl $1, %ebp /* (word^magic)|magic == 0xffffffff? */ + jne L4 /* yes => we found word with NUL */ + + movl $magic, %esi /* load magic value */ + xorl %edx, %ebx /* clear words which are C */ + + movl (%eax), %ecx + addl %ebx, %esi /* (word+magic) */ + + movl $magic, %edi + jnc L5 /* previous addl caused overflow? */ + + movl %edi, %ebp + xorl %ebx, %esi /* (word+magic)^word */ + + addl %ecx, %ebp + orl $magic, %esi /* ((word+magic)^word)|magic */ + + addl $1, %esi /* ((word+magic)^word)|magic==0xf..f?*/ + jne L5 /* yes => we found word with C */ + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + jne L5 + + xorl %ecx, %ebp + addl %ecx, %edi + + leal 4(%eax), %eax + jnc L4 + + movl %ecx, %ebx + orl $magic, %ebp + + addl $1, %ebp + jne L4 + + movl $magic, %esi + xorl %edx, %ebx + + movl (%eax), %ecx + addl %ebx, %esi + + movl $magic, %edi + jnc L5 + + movl %edi, %ebp + xorl %ebx, %esi + + addl %ecx, %ebp + orl $magic, %esi + + addl $1, %esi + + je L1 + + /* We know there is no NUL byte but a C byte in the word. + %ebx contains NUL in this particular byte. */ +L5: subl $4, %eax /* adjust pointer */ + testb %bl, %bl /* first byte == C? */ + + jz L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + testb %bh, %bh /* second byte == C? */ + + jz L2 /* yes => return pointer */ + + shrl $16, %ebx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmp $0, %bl /* third byte == C */ + je L2 /* yes => return pointer */ + + incl %eax /* increment pointer */ + +L2: popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + + /* We know there is a NUL byte in the word. But we have to test + whether there is an C byte before it in the word. */ +L4: subl $4, %eax /* adjust pointer */ + cmpb %dl, %cl /* first byte == C? */ + + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* first byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + cmpb %dl, %ch /* second byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %ch /* second byte == NUL? */ + je L3 /* yes => return NULL */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb %dl, %cl /* third byte == C? */ + je L2 /* yes => return pointer */ + + cmpb $0, %cl /* third byte == NUL? */ + je L3 /* yes => return NULL */ + + incl %eax /* increment pointer */ + + /* The test four the fourth byte is necessary! */ + cmpb %dl, %ch /* fourth byte == C? */ + je L2 /* yes => return pointer */ + +L3: xorl %eax, %eax /* set return value = NULL */ + + popl %ebp /* restore saved registers */ + popl %ebx + + popl %esi + popl %edi + + ret + +#undef index +weak_alias (strchr, index) diff --git a/sysdeps/i386/i586/strlen.S b/sysdeps/i386/i586/strlen.S new file mode 100644 index 0000000..b807ed4 --- /dev/null +++ b/sysdeps/i386/i586/strlen.S @@ -0,0 +1,185 @@ +/* strlen -- Compute length og NUL terminated string. +Highly optimized version for ix86, x>=5. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. +Contributed by Ulrich Drepper, . + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include + +/* This version is especially optimized for the i586 (and following?) + processors. This is mainly done by using the two pipelines. The + version optimized for i486 is weak in this aspect because to get + as much parallelism we have to executs some *more* instructions. + + The code below is structured to reflect the pairing of the instructions + as *I think* it is. I have no processor data book to verify this. + If you find something you think is incorrect let me know. */ + + +/* The magic value which is used throughout in the whole code. */ +#define magic 0xfefefeff + +/* + INPUT PARAMETERS: + str (sp + 4) +*/ + + .text +ENTRY(strlen) + movl 4(%esp), %eax /* get string pointer */ + + movl %eax, %ecx /* duplicate it */ + andl $3, %ecx /* mask alignment bits */ + + jz L11 /* aligned => start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $3, %ecx /* was alignment = 3? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + cmpl $2, %ecx /* was alignment = 2? */ + + je L11 /* yes => now it is aligned and start loop */ + + cmpb %ch, (%eax) /* is byte NUL? */ + je L2 /* yes => return */ + + incl %eax /* increment pointer */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. */ +L11: xorl %edx, %edx /* We need %edx == 0 for later */ + +L1: + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* complete negation of word */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + jne L3 /* yes => determine byte */ + + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + addl $4, %eax /* adjust pointer for *next* word */ + + subl %ecx, %edx /* first step to negate word */ + addl $magic, %ecx /* add magic word */ + + decl %edx /* wcomplete negation of ord */ + jnc L3 /* previous addl caused overflow? */ + + xorl %ecx, %edx /* (word+magic)^word */ + subl $magic, %ecx /* undo previous addl to restore word */ + + andl $~magic, %edx /* any of the carry flags set? */ + + je L1 /* no => start loop again */ + + +L3: subl $4, %eax /* correct too early pointer increment */ + testb %cl, %cl /* lowest byte NUL? */ + + jz L2 /* yes => return */ + + inc %eax /* increment pointer */ + testb %ch, %ch /* second byte NUL? */ + + jz L2 /* yes => return */ + + shrl $16, %ecx /* make upper bytes accessible */ + incl %eax /* increment pointer */ + + cmpb $0, %cl /* is third byte NUL? */ + jz L2 /* yes => return */ + + incl %eax /* increment pointer */ + +L2: subl 4(%esp), %eax /* now compute the length as difference + between start and terminating NUL + character */ + + ret diff --git a/sysdeps/i386/i586/sub_n.S b/sysdeps/i386/i586/sub_n.S new file mode 100644 index 0000000..1382e66 --- /dev/null +++ b/sysdeps/i386/i586/sub_n.S @@ -0,0 +1,136 @@ +/* Pentium __mpn_sub_n -- Subtract two limb vectors of the same length > 0 + and store difference in a third limb vector. + +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + s2_ptr (sp + 12) + size (sp + 16) +*/ + +#define r1 %eax +#define r2 %edx +#define src1 %esi +#define src2 %ebp +#define dst %edi +#define x %ebx + +#include "sysdep.h" +#include "asm-syntax.h" + +.text + ALIGN (3) + .globl C_SYMBOL_NAME(__mpn_sub_n) +C_SYMBOL_NAME(__mpn_sub_n:) + pushl %edi + pushl %esi + pushl %ebx + pushl %ebp + + movl 20(%esp),dst /* res_ptr */ + movl 24(%esp),src1 /* s1_ptr */ + movl 28(%esp),src2 /* s2_ptr */ + movl 32(%esp),%ecx /* size */ + + movl (src2),x + + decl %ecx + movl %ecx,r2 + shrl $3,%ecx + andl $7,r2 + testl %ecx,%ecx /* zero carry flag */ + jz Lend + pushl r2 + + ALIGN (3) +Loop: movl 28(dst),%eax /* fetch destination cache line */ + leal 32(dst),dst + +L1: movl (src1),r1 + movl 4(src1),r2 + sbbl x,r1 + movl 4(src2),x + sbbl x,r2 + movl 8(src2),x + movl r1,-32(dst) + movl r2,-28(dst) + +L2: movl 8(src1),r1 + movl 12(src1),r2 + sbbl x,r1 + movl 12(src2),x + sbbl x,r2 + movl 16(src2),x + movl r1,-24(dst) + movl r2,-20(dst) + +L3: movl 16(src1),r1 + movl 20(src1),r2 + sbbl x,r1 + movl 20(src2),x + sbbl x,r2 + movl 24(src2),x + movl r1,-16(dst) + movl r2,-12(dst) + +L4: movl 24(src1),r1 + movl 28(src1),r2 + sbbl x,r1 + movl 28(src2),x + sbbl x,r2 + movl 32(src2),x + movl r1,-8(dst) + movl r2,-4(dst) + + leal 32(src1),src1 + leal 32(src2),src2 + decl %ecx + jnz Loop + + popl r2 +Lend: + decl r2 /* test r2 w/o clobbering carry */ + js Lend2 + incl r2 +Loop2: + leal 4(dst),dst + movl (src1),r1 + sbbl x,r1 + movl 4(src2),x + movl r1,-4(dst) + leal 4(src1),src1 + leal 4(src2),src2 + decl r2 + jnz Loop2 +Lend2: + movl (src1),r1 + sbbl x,r1 + movl r1,(dst) + + sbbl %eax,%eax + negl %eax + + popl %ebp + popl %ebx + popl %esi + popl %edi + ret diff --git a/sysdeps/i386/i586/submul_1.S b/sysdeps/i386/i586/submul_1.S new file mode 100644 index 0000000..14bfe54 --- /dev/null +++ b/sysdeps/i386/i586/submul_1.S @@ -0,0 +1,82 @@ +/* Pentium __mpn_submul_1 -- Multiply a limb vector with a limb and subtract + the result from a second limb vector. + +Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +/* + INPUT PARAMETERS + res_ptr (sp + 4) + s1_ptr (sp + 8) + size (sp + 12) + s2_limb (sp + 16) +*/ + +#include "sysdep.h" +#include "asm-syntax.h" + +#define res_ptr edi +#define s1_ptr esi +#define size ecx +#define s2_limb ebp + + TEXT + ALIGN (3) + GLOBL C_SYMBOL_NAME(__mpn_submul_1) +C_SYMBOL_NAME(__mpn_submul_1:) + + INSN1(push,l ,R(edi)) + INSN1(push,l ,R(esi)) + INSN1(push,l ,R(ebx)) + INSN1(push,l ,R(ebp)) + + INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) + INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) + INSN2(mov,l ,R(size),MEM_DISP(esp,28)) + INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) + + INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) + INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) + INSN1(neg,l ,R(size)) + INSN2(xor,l ,R(edx),R(edx)) + ALIGN (3) +Loop: + INSN2(mov,l ,R(ebx),R(edx)) + INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) + + INSN1(mul,l ,R(s2_limb)) + + INSN2(add,l ,R(eax),R(ebx)) + INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) + + INSN2(adc,l ,R(edx),$0) + INSN2(sub,l ,R(ebx),R(eax)) + + INSN2(adc,l ,R(edx),$0) + INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) + + INSN1(inc,l ,R(size)) + INSN1(jnz, ,Loop) + + + INSN2(mov,l ,R(eax),R(edx)) + INSN1(pop,l ,R(ebp)) + INSN1(pop,l ,R(ebx)) + INSN1(pop,l ,R(esi)) + INSN1(pop,l ,R(edi)) + ret diff --git a/sysdeps/i386/memchr.S b/sysdeps/i386/memchr.S new file mode 100644 index 0000000..9931f97 --- /dev/null +++ b/sysdeps/i386/memchr.S @@ -0,0 +1,315 @@ +/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less + than N. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Optimised a little by Alan Modra +This file is part of the GNU C Library. + +This version is developed using the same algorithm as the fast C +version which carries the following introduction: + +Based on strlen implemention by Torbjorn Granlund (tege@sics.se), +with help from Dan Sahlin (dan@sics.se) and +commentary by Jim Blandy (jimb@ai.mit.edu); +adaptation to memchr suggested by Dick Karpinski (dick@cca.ucsf.edu), +and implemented by Roland McGrath (roland@ai.mit.edu). + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + c (sp + 8) + len (sp + 12) +*/ + + .text +ENTRY (memchr) + /* Save callee-safe registers used in this function. */ + pushl %esi + pushl %edi + + /* Load parameters into registers. */ + movl 12(%esp), %eax /* str: pointer to memory block. */ + movl 16(%esp), %edx /* c: byte we are looking for. */ + movl 20(%esp), %esi /* len: length of memory block. */ + + /* If my must not test more than three characters test + them one by one. This is especially true for 0. */ + cmpl $4, %esi + jb L3 + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* Now it is 0|0|c|c */ + movl %edx, %ecx + shll $16, %edx /* Now c|c|0|0 */ + movw %cx, %dx /* And finally c|c|c|c */ + + /* Better performance can be achieved if the word (32 + bit) memory access is aligned on a four-byte-boundary. + So process first bytes one by one until boundary is + reached. Don't use a loop for better performance. */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + je L4 /* len==0 => return NULL */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + je L4 /* len==0 => return NULL */ + + testb $3, %eax /* correctly aligned ? */ + je L2 /* yes => begin loop */ + cmpb %dl, (%eax) /* compare byte */ + je L9 /* target found => return */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length counter */ + /* no test for len==0 here, because this is done in the + loop head */ + jmp L2 + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + + /* Each round the main loop processes 16 bytes. */ + + ALIGN (4) + +L1: movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occured in the last byte => it was 0. */ + jnc L8 + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L8 /* found it => return pointer */ + + /* This process is unfolded four times for better performance. + we don't increment the source pointer each time. Instead we + use offsets and increment by 16 in each run of the loop. But + before probing for the matching byte we need some extra code + (following LL(13) below). Even the len can be compared with + constants instead of decrementing each time. */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L7 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L7 /* found it => return pointer */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L6 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L6 /* found it => return pointer */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L5 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L5 /* found it => return pointer */ + + /* Adjust both counters for a full round, i.e. 16 bytes. */ + addl $16, %eax +L2: subl $16, %esi + jae L1 /* Still more than 16 bytes remaining */ + + /* Process remaining bytes separately. */ + cmpl $4-16, %esi /* rest < 4 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + cmpl $8-16, %esi /* rest < 8 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + cmpl $12-16, %esi /* rest < 12 bytes? */ + jb L3 /* yes, than test byte by byte */ + + movl (%eax), %ecx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L8 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jne L8 /* found it => return pointer */ + addl $4, %eax /* adjust source pointer */ + + /* Check the remaining bytes one by one. */ +L3: andl $3, %esi /* mask out uninteresting bytes */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + incl %eax /* increment source pointer */ + decl %esi /* decrement length */ + jz L4 /* no remaining bytes => return NULL */ + + cmpb %dl, (%eax) /* compare byte with C */ + je L9 /* equal, than return pointer */ + +L4: /* no byte found => return NULL */ + xorl %eax, %eax + jmp L9 + + /* add missing source pointer increments */ +L5: addl $4, %eax +L6: addl $4, %eax +L7: addl $4, %eax + + /* Test for the matching byte in the word. %ecx contains a NUL + char in the byte which originally was the byte we are looking + at. */ +L8: testb %cl, %cl /* test first byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + testb %ch, %ch /* test second byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + testl $0xff0000, %ecx /* test third byte in dword */ + jz L9 /* if zero => return pointer */ + incl %eax /* increment source pointer */ + + /* No further test needed we we known it is one of the four byytes. */ + +L9: popl %edi /* pop saved registers */ + popl %esi + + ret diff --git a/sysdeps/i386/memchr.c b/sysdeps/i386/memchr.c deleted file mode 100644 index ff0f8d9..0000000 --- a/sysdeps/i386/memchr.c +++ /dev/null @@ -1,48 +0,0 @@ -/* memchr (str, ch, n) -- Return pointer to first occurrence of CH in STR less - than N. - For Intel 80x86, x>=3. - Copyright (C) 1991, 1992, 1993 Free Software Foundation, Inc. - Contributed by Torbjorn Granlund (tege@sics.se). - -The GNU C Library is free software; you can redistribute it and/or -modify it under the terms of the GNU Library General Public License as -published by the Free Software Foundation; either version 2 of the -License, or (at your option) any later version. - -The GNU C Library is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -Library General Public License for more details. - -You should have received a copy of the GNU Library General Public -License along with the GNU C Library; see the file COPYING.LIB. If -not, write to the Free Software Foundation, Inc., 675 Mass Ave, -Cambridge, MA 02139, USA. */ - -#include -#include - -#ifdef __GNUC__ - -PTR -DEFUN(memchr, (str, c, len), - CONST PTR str AND int c AND size_t len) -{ - PTR retval; - asm("cld\n" /* Search forward. */ - "testl %1,%1\n" /* Clear Z flag, to handle LEN == 0. */ - /* Some old versions of gas need `repne' instead of `repnz'. */ - "repnz\n" /* Search for C in al. */ - "scasb\n" - "movl %2,%0\n" /* Set %0 to 0 (without affecting Z flag). */ - "jnz done\n" /* Jump if we found nothing equal to C. */ - "leal -1(%1),%0\n" /* edi has been incremented. Return edi-1. */ - "done:" : - "=a" (retval), "=D" (str), "=c" (len) : - "0" (c), "1" (str), "2" (len)); - return retval; -} - -#else -#include -#endif diff --git a/sysdeps/i386/memcmp.S b/sysdeps/i386/memcmp.S new file mode 100644 index 0000000..f16b44a --- /dev/null +++ b/sysdeps/i386/memcmp.S @@ -0,0 +1,68 @@ +/* memcmp -- compare two memory blocks for differences in the first COUNT + bytes. +Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + block1 (sp + 4) + block2 (sp + 8) + len (sp + 12) +*/ + + .text +ENTRY (memcmp) + pushl %esi /* Save callee-safe registers. */ + movl %edi, %edx /* Note that %edx is not used and can + so be used to save %edi. It's faster. */ + + movl 12(%esp), %esi /* Load address of block #1. */ + movl 16(%esp), %edi /* Load address of block #2. */ + movl 20(%esp), %ecx /* Load maximal length of compare area. */ + + cld /* Set direction of comparison. */ + + xorl %eax, %eax /* Default result. */ + + repe /* Compare at most %ecx bytes. */ + cmpsb + jz L1 /* If even last byte was equal we return 0. */ + + /* The memory blocks are not equal. So result of the last + subtraction is present in the carry flag. It is set when + the byte in block #2 is bigger. In this case we have to + return -1 (=0xffffffff), else 1. */ + sbbl %eax, %eax /* This is tricky. %eax == 0 and carry is set + or not depending on last subtraction. */ + + /* At this point %eax == 0, if the byte of block #1 was bigger, and + 0xffffffff if the last byte of block #2 was bigger. The later + case is already correct but the former needs a little adjustment. + Note that the following operation does not change 0xffffffff. */ + orb $1, %al /* Change 0 to 1. */ + +L1: popl %esi /* Restore registers. */ + movl %edx, %edi + + ret + +#undef bcmp +weak_alias (memcmp, bcmp) diff --git a/sysdeps/i386/stpcpy.S b/sysdeps/i386/stpcpy.S new file mode 100644 index 0000000..f38a908 --- /dev/null +++ b/sysdeps/i386/stpcpy.S @@ -0,0 +1,87 @@ +/* stpcpy -- copy SRC to DEST returning the address of the terminating '\0' + in DEST. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper (drepper@gnu.ai.mit.edu). +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +/* This function is defined neither in ANSI nor POSIX standards but is + also not invented here. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + dest (sp + 4) + src (sp + 8) +*/ + + .text +ENTRY (__stpcpy) + movl 4(%esp), %eax /* load destination pointer */ + movl 8(%esp), %ecx /* load source pointer */ + + subl %eax, %ecx /* magic: reduce number of loop variants + to one using addressing mode */ + + /* Here we would like to write + + subl $4, %eax + ALIGN (4) + + but the assembler is too smart and optimizes for the shortest + form where the number only needs one byte. But if we could + have the long form we would not need the alignment. */ + + .byte 0x81, 0xe8 /* This is `subl $0x00000004, %eax' */ + .long 0x00000004 + + /* Four times unfolded loop with only one loop counter. This + is achieved by the use of index+base adressing mode. As the + loop counter we use the destination address because this is + also the result. */ +L1: addl $4, %eax /* increment loop counter */ + + movb (%eax,%ecx), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L2 /* yes, then exit */ + + movb 1(%eax,%ecx), %dl /* load current char */ + movb %dl, 1(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L3 /* yes, then exit */ + + movb 2(%eax,%ecx), %dl /* load current char */ + movb %dl, 2(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L4 /* yes, then exit */ + + movb 3(%eax,%ecx), %dl /* load current char */ + movb %dl, 3(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jnz L1 /* no, then continue loop */ + + incl %eax /* correct loop counter */ +L4: incl %eax +L3: incl %eax +L2: + ret + +weak_alias (__stpcpy, stpcpy) diff --git a/sysdeps/i386/stpncpy.S b/sysdeps/i386/stpncpy.S new file mode 100644 index 0000000..59192e6 --- /dev/null +++ b/sysdeps/i386/stpncpy.S @@ -0,0 +1,143 @@ +/* stpncpy -- copy no more then N bytes from SRC to DEST, returning the + address of the terminating '\0' in DEST. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Some bug fixes by Alan Modra + - original wrote n+1 chars in some cases. + - stpncpy() ought to behave like strncpy() ie. not null-terminate + if limited by n. glibc-1.09 stpncpy() does this. +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + dest (sp + 4) + src (sp + 8) + maxlen (sp + 12) +*/ + + .text +ENTRY (__stpncpy) + + pushl %esi + + movl 8(%esp), %eax /* load destination pointer */ + movl 12(%esp), %esi /* load source pointer */ + movl 16(%esp), %ecx /* load maximal length */ + + subl %eax, %esi /* magic: reduce number of loop variants + to one using addressing mode */ + jmp L1 /* jump to loop "head" */ + + ALIGN(4) + + /* Four times unfolded loop with two loop counters. We get the + the third value (the source address) by using the index+base + adressing mode. */ +L2: movb (%eax,%esi), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L7 /* yes, then exit */ + + movb 1(%eax,%esi), %dl /* load current char */ + movb %dl, 1(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L6 /* yes, then exit */ + + movb 2(%eax,%esi), %dl /* load current char */ + movb %dl, 2(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L5 /* yes, then exit */ + + movb 3(%eax,%esi), %dl /* load current char */ + movb %dl, 3(%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L4 /* yes, then exit */ + + addl $4, %eax /* increment loop counter for full round */ + +L1: subl $4, %ecx /* still more than 4 bytes allowed? */ + jae L2 /* yes, then go to start of loop */ + + /* The maximal remaining 15 bytes are not processed in a loop. */ + + addl $4, %ecx /* correct above subtraction */ + jz L9 /* maximal allowed char reached => go to end */ + + movb (%eax,%esi), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L3 /* yes, then exit */ + + incl %eax /* increment pointer */ + decl %ecx /* decrement length counter */ + jz L9 /* no more allowed => exit */ + + movb (%eax,%esi), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L3 /* yes, then exit */ + + incl %eax /* increment pointer */ + decl %ecx /* decrement length counter */ + jz L9 /* no more allowed => exit */ + + movb (%eax,%esi), %dl /* load current char */ + movb %dl, (%eax) /* and store it */ + testb %dl, %dl /* was it NUL? */ + jz L3 /* yes, then exit */ + + incl %eax /* increment pointer */ + jmp L9 /* we don't have to test for counter underflow + because we know we had a most 3 bytes + remaining => exit */ + + /* When coming from the main loop we have to adjust the pointer. */ +L4: decl %ecx /* decrement counter */ + incl %eax /* increment pointer */ + +L5: decl %ecx /* increment pointer */ + incl %eax /* increment pointer */ + +L6: decl %ecx /* increment pointer */ + incl %eax /* increment pointer */ +L7: + + addl $3, %ecx /* correct pre-decrementation of counter + at the beginning of the loop; but why 3 + and not 4? Very simple, we have to count + the NUL char we already wrote. */ + jz L9 /* counter is also 0 => exit */ + + /* We now have to fill the rest of the buffer with NUL. This + is done in a tricky way. Please note that the adressing mode + used below is not the same we used above. Here we use the + %ecx register. */ +L8: + movb $0, (%ecx,%eax) /* store NUL char */ +L3: decl %ecx /* all bytes written? */ + jnz L8 /* no, then again */ + +L9: popl %esi /* restore saved register content */ + + ret + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/i386/strchr.S b/sysdeps/i386/strchr.S new file mode 100644 index 0000000..de947cd --- /dev/null +++ b/sysdeps/i386/strchr.S @@ -0,0 +1,278 @@ +/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Some optimisations by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strchr) + pushl %edi /* Save callee-safe registers used here. */ + + movl 8(%esp), %eax /* get string pointer */ + movl 12(%esp), %edx /* get character we are looking for */ + + /* At the moment %edx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %dl, %dh /* now it is 0|0|c|c */ + movl %edx, %ecx + shll $16, %edx /* now it is c|c|0|0 */ + movw %cx, %dx /* and finally c|c|c|c */ + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 32-bit memory access is faster + and (more important) + 2. we process in the main loop 32 bit in one step although + we don't know the end of the string. But accessing at + 4-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherant + boundaries are multiples of 4. */ + + testb $3, %eax /* correctly aligned ? */ + jz L11 /* yes => begin loop */ + movb (%eax), %cl /* load byte in question (we need it twice) */ + cmpb %cl, %dl /* compare byte */ + je L6 /* target found => return */ + testb %cl, %cl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %eax /* increment pointer */ + + testb $3, %eax /* correctly aligned ? */ + jz L11 /* yes => begin loop */ + movb (%eax), %cl /* load byte in question (we need it twice) */ + cmpb %cl, %dl /* compare byte */ + je L6 /* target found => return */ + testb %cl, %cl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %eax /* increment pointer */ + + testb $3, %eax /* correctly aligned ? */ + jz L11 /* yes => begin loop */ + movb (%eax), %cl /* load byte in question (we need it twice) */ + cmpb %cl, %dl /* compare byte */ + je L6 /* target found => return */ + testb %cl, %cl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %eax /* increment pointer */ + + /* No we have reached alignment. */ + jmp L11 /* begin loop */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + /* Each round the main loop processes 16 bytes. */ + + ALIGN(4) + +L1: addl $16, %eax /* adjust pointer for whole round */ + +L11: movl (%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occured in the last byte => it was 0. */ + jnc L7 + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L7 /* found it => return pointer */ + + /* Now we made sure the dword does not contain the character we are + looking for. But because we deal with strings we have to check + for the end of string before testing the next dword. */ + + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L2 /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L2 /* found NUL => return NULL */ + + movl 4(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L71 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L71 /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L2 /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L2 /* found NUL => return NULL */ + + movl 8(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L72 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L72 /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L2 /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L2 /* found NUL => return NULL */ + + movl 12(%eax), %ecx /* get word (= 4 bytes) in question */ + xorl %edx, %ecx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* C */ + jnc L73 /* highest byte is C => return pointer */ + xorl %ecx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L73 /* found it => return pointer */ + xorl %edx, %ecx /* restore original dword without reload */ + movl $0xfefefeff, %edi /* magic value */ + addl %ecx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L2 /* highest byte is NUL => return NULL */ + xorl %ecx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L1 /* no NUL found => restart loop */ + +L2: /* Return NULL. */ + xorl %eax, %eax /* load NULL in return value register */ + popl %edi /* restore saved register content */ + ret + +L73: addl $4, %eax /* adjust pointer */ +L72: addl $4, %eax +L71: addl $4, %eax + + /* We now scan for the byte in which the character was matched. + But we have to take care of the case that a NUL char is + found before this in the dword. */ + +L7: testb %cl, %cl /* is first byte C? */ + jz L6 /* yes => return pointer */ + cmpb %dl, %cl /* is first byte NUL? */ + je L2 /* yes => return NULL */ + incl %eax /* it's not in the first byte */ + + testb %ch, %ch /* is second byte C? */ + jz L6 /* yes => return pointer */ + cmpb %dl, %ch /* is second byte NUL? */ + je L2 /* yes => return NULL? */ + incl %eax /* it's not in the second byte */ + + shrl $16, %ecx /* make upper byte accessible */ + testb %cl, %cl /* is third byte C? */ + jz L6 /* yes => return pointer */ + cmpb %dl, %cl /* is third byte NUL? */ + je L2 /* yes => return NULL */ + + /* It must be in the fourth byte and it cannot be NUL. */ + incl %eax + +L6: popl %edi /* restore saved register content */ + + ret + +weak_alias (strchr, index) diff --git a/sysdeps/i386/strcspn.S b/sysdeps/i386/strcspn.S new file mode 100644 index 0000000..b0e789b --- /dev/null +++ b/sysdeps/i386/strcspn.S @@ -0,0 +1,176 @@ +/* strcspn (str, ss) -- Return the length of the initial segement of STR + which contains no characters from SS. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Bug fixes by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + stopset (sp + 8) +*/ + + .text +ENTRY (strcspn) + movl 4(%esp), %edx /* get string pointer */ + movl 8(%esp), %eax /* get stopset pointer */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. The unrolled form is much faster than a loop. */ + xorl %ecx, %ecx /* %ecx = 0 !!! */ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L4 /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L5 /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L6 /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L3 /* yes => return */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: subl %edx, %eax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + addl $256, %esp /* remove stopset */ + + ret diff --git a/sysdeps/i386/strpbrk.S b/sysdeps/i386/strpbrk.S new file mode 100644 index 0000000..245bf1a --- /dev/null +++ b/sysdeps/i386/strpbrk.S @@ -0,0 +1,177 @@ +/* strcspn (str, ss) -- Return the length of the initial segement of STR + which contains no characters from SS. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Bug fixes by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + stopset (sp + 8) +*/ + + .text +ENTRY (strpbrk) + movl 4(%esp), %edx /* get string pointer */ + movl 8(%esp), %eax /* get stopset pointer */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. The unrolled form is much faster than a loop. */ + xorl %ecx, %ecx /* %ecx = 0 !!! */ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L4 /* yes => return */ + + movb 1(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L5 /* yes => return */ + + movb 2(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + je L6 /* yes => return */ + + movb 3(%eax), %cl /* get byte from string */ + cmpb %cl, (%esp,%ecx) /* is it contained in stopset? */ + jne L3 /* yes => return */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: addl $256, %esp /* remove stopset */ + + orb %cl, %cl /* was last character NUL? */ + jnz L7 /* no => return pointer */ + xorl %eax, %eax /* return NULL */ + +L7: ret diff --git a/sysdeps/i386/strrchr.S b/sysdeps/i386/strrchr.S new file mode 100644 index 0000000..468a940 --- /dev/null +++ b/sysdeps/i386/strrchr.S @@ -0,0 +1,321 @@ +/* strchr (str, ch) -- Return pointer to last occurrence of CH in STR. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Some optimisations by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + ch (sp + 8) +*/ + + .text +ENTRY (strrchr) + pushl %edi /* Save callee-safe registers used here. */ + pushl %esi + + xorl %eax, %eax + movl 12(%esp), %esi /* get string pointer */ + movl 16(%esp), %ecx /* get character we are looking for */ + + /* At the moment %ecx contains C. What we need for the + algorithm is C in all bytes of the dword. Avoid + operations on 16 bit words because these require an + prefix byte (and one more cycle). */ + movb %cl, %ch /* now it is 0|0|c|c */ + movl %ecx, %edx + shll $16, %ecx /* now it is c|c|0|0 */ + movw %dx, %cx /* and finally c|c|c|c */ + + /* Before we start with the main loop we process single bytes + until the source pointer is aligned. This has two reasons: + 1. aligned 32-bit memory access is faster + and (more important) + 2. we process in the main loop 32 bit in one step although + we don't know the end of the string. But accessing at + 4-byte alignment guarantees that we never access illegal + memory if this would not also be done by the trivial + implementation (this is because all processor inherant + boundaries are multiples of 4. */ + + testb $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L11 /* target found => return */ + movl %esi, %eax /* remember pointer as possible result */ +L11: orb %dl, %dl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %esi /* increment pointer */ + + testb $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L12 /* target found => return */ + movl %esi, %eax /* remember pointer as result */ +L12: orb %dl, %dl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %esi /* increment pointer */ + + testb $3, %esi /* correctly aligned ? */ + jz L19 /* yes => begin loop */ + movb (%esi), %dl /* load byte in question (we need it twice) */ + cmpb %dl, %cl /* compare byte */ + jne L13 /* target found => return */ + movl %esi, %eax /* remember pointer as result */ +L13: orb %cl, %cl /* is NUL? */ + jz L2 /* yes => return NULL */ + incl %esi /* increment pointer */ + + /* No we have reached alignment. */ + jmp L19 /* begin loop */ + + /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to + change any of the hole bits of LONGWORD. + + 1) Is this safe? Will it catch all the zero bytes? + Suppose there is a byte with all zeros. Any carry bits + propagating from its left will fall into the hole at its + least significant bit and stop. Since there will be no + carry from its most significant bit, the LSB of the + byte to the left will be unchanged, and the zero will be + detected. + + 2) Is this worthwhile? Will it ignore everything except + zero bytes? Suppose every byte of LONGWORD has a bit set + somewhere. There will be a carry into bit 8. If bit 8 + is set, this will carry into bit 16. If bit 8 is clear, + one of bits 9-15 must be set, so there will be a carry + into bit 16. Similarly, there will be a carry into bit + 24. If one of bits 24-31 is set, there will be a carry + into bit 32 (=carry flag), so all of the hole bits will + be changed. + + 3) But wait! Aren't we looking for C, not zero? + Good point. So what we do is XOR LONGWORD with a longword, + each of whose bytes is C. This turns each byte that is C + into a zero. */ + + /* Each round the main loop processes 16 bytes. */ + + /* Jump to here when the character is detected. We chose this + way around because the character one is looking for is not + as frequent as the rest and taking a conditional jump is more + expensive than ignoring it. + + Some more words to the code below: it might not be obvious why + we decrement the source pointer here. In the loop the pointer + is not pre-incremented and so it still points before the word + we are looking at. But you should take a look at the instruction + which gets executed before we get into the loop: `addl $16, %esi'. + This makes the following subs into adds. */ + + /* These fill bytes make the main loop be correctly aligned. + We cannot use align because it is not the following instruction + which should be aligned. */ + .byte 0, 0, 0, 0, 0, 0, 0, 0 + +L4: subl $4, %esi /* adjust pointer */ +L41: subl $4, %esi +L42: subl $4, %esi +L43: testl $0xff000000, %edx /* is highest byte == C? */ + jnz L33 /* no => try other bytes */ + leal 15(%esi), %eax /* store address as result */ + jmp L1 /* and start loop again */ + +L3: subl $4, %esi /* adjust pointer */ +L31: subl $4, %esi +L32: subl $4, %esi +L33: testl $0xff0000, %edx /* is C in third byte? */ + jnz L51 /* no => try other bytes */ + leal 14(%esi), %eax /* store address as result */ + jmp L1 /* and start loop again */ + +L51: + /* At this point we know that the byte is in one of the lower bytes. + We make a guess and correct it if necessary. This reduces the + number of necessary jumps. */ + leal 12(%esi), %eax /* guess address of lowest byte as result */ + testb %dh, %dh /* is guess correct? */ + jnz L1 /* yes => start loop */ + leal 13(%esi), %eax /* correct guess to second byte */ + +L1: addl $16, %esi /* increment pointer for full round */ + +L19: movl (%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + + /* According to the algorithm we had to reverse the effect of the + XOR first and then test the overflow bits. But because the + following XOR would destroy the carry flag and it would (in a + representation with more than 32 bits) not alter then last + overflow, we can now test this condition. If no carry is signaled + no overflow must have occured in the last byte => it was 0. */ + + jnc L20 /* found NUL => check last word */ + + /* We are only interested in carry bits that change due to the + previous add, so remove original bits */ + xorl %edx, %edi /* (word+magic)^word */ + + /* Now test for the other three overflow bits. */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + + /* If at least one byte of the word is C we don't get 0 in %edi. */ + jnz L20 /* found NUL => check last word */ + + /* Now we made sure the dword does not contain the character we are + looking for. But because we deal with strings we have to check + for the end of string before testing the next dword. */ + + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L4 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L3 /* C is detected in the word => examine it */ + + movl 4(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L21 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L21 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L41 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L31 /* C is detected in the word => examine it */ + + movl 8(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L22 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L22 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L42 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L32 /* C is detected in the word => examine it */ + + movl 12(%esi), %edx /* get word (= 4 bytes) in question */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L23 /* found NUL => check last word */ + xorl %edx, %edi /* (word+magic)^word */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jnz L23 /* found NUL => check last word */ + xorl %ecx, %edx /* XOR with word c|c|c|c => bytes of str == c + are now 0 */ + movl $0xfefefeff, %edi /* magic value */ + addl %edx, %edi /* add the magic value to the word. We get + carry bits reported for each byte which + is *not* 0 */ + jnc L43 /* highest byte is C => examine dword */ + xorl %edx, %edi /* ((word^charmask)+magic)^(word^charmask) */ + orl $0xfefefeff, %edi /* set all non-carry bits */ + incl %edi /* add 1: if one carry bit was *not* set + the addition will not result in 0. */ + jz L1 /* C is not detected => restart loop */ + jmp L33 /* examine word */ + +L23: addl $4, %esi /* adjust pointer */ +L22: addl $4, %esi +L21: addl $4, %esi + + /* What remains to do is to test which byte the NUL char is and + whether the searched character appears in one of the bytes + before. A special case is that the searched byte maybe NUL. + In this case a pointer to the terminating NUL char has to be + returned. */ + +L20: cmpb %cl, %dl /* is first byte == C? */ + jne L24 /* no => skip */ + movl %esi, %eax /* store address as result */ +L24: testb %dl, %dl /* is first byte == NUL? */ + jz L2 /* yes => return */ + + cmpb %cl, %dh /* is second byte == C? */ + jne L25 /* no => skip */ + leal 1(%esi), %eax /* store address as result */ +L25: testb %dh, %dh /* is second byte == NUL? */ + jz L2 /* yes => return */ + + shrl $16,%edx /* make upper bytes accessible */ + cmpb %cl, %dl /* is third byte == C */ + jne L26 /* no => skip */ + leal 2(%esi), %eax /* store address as result */ +L26: testb %dl, %dl /* is third byte == NUL */ + jz L2 /* yes => return */ + + cmpb %cl, %dh /* is fourth byte == C */ + jne L2 /* no => skip */ + leal 3(%esi), %eax /* store address as result */ + +L2: popl %esi /* restore saved register content */ + popl %edi + + ret + +weak_alias (strrchr, rindex) diff --git a/sysdeps/i386/strspn.S b/sysdeps/i386/strspn.S new file mode 100644 index 0000000..1a02026 --- /dev/null +++ b/sysdeps/i386/strspn.S @@ -0,0 +1,176 @@ +/* strcspn (str, ss) -- Return the length of the initial segement of STR + which contains only characters from SS. +For Intel 80x86, x>=3. +Copyright (C) 1994, 1995 Free Software Foundation, Inc. +Contributed by Ulrich Drepper +Bug fixes by Alan Modra +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + +#include +#include "asm-syntax.h" + +/* + INPUT PARAMETERS: + str (sp + 4) + skipset (sp + 8) +*/ + + .text +ENTRY (strspn) + movl 4(%esp), %edx /* get string pointer */ + movl 8(%esp), %eax /* get skipset pointer */ + + /* First we create a table with flags for all possible characters. + For the ASCII (7bit/8bit) or ISO-8859-X character sets which are + supported by the C string functions we have 256 characters. + Before inserting marks for the stop characters we clear the whole + table. The unrolled form is much faster than a loop. */ + xorl %ecx, %ecx /* %ecx = 0 !!! */ + + pushl %ecx /* make a 256 bytes long block filled with 0 */ + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl %ecx + pushl $0 /* These immediate values make the label 2 */ + pushl $0 /* to be aligned on a 16 byte boundary to */ + pushl $0 /* get a better performance of the loop. */ + pushl $0 + pushl $0 + pushl $0 + +/* For understanding the following code remember that %ecx == 0 now. + Although all the following instruction only modify %cl we always + have a correct zero-extended 32-bit value in %ecx. */ + +/* Don't change the "testb $0xff,%%cl" to "testb %%cl,%%cl". We want + longer instructions so that the next loop aligns without adding nops. */ + +L2: movb (%eax), %cl /* get byte from stopset */ + testb %cl, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 1(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 2(%eax), %cl /* get byte from stopset */ + testb $0xff, %cl /* is NUL char? */ + jz L1 /* yes => start compare loop */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + + movb 3(%eax), %cl /* get byte from stopset */ + addl $4, %eax /* increment stopset pointer */ + movb %cl, (%esp,%ecx) /* set corresponding byte in stopset table */ + testb $0xff, %cl /* is NUL char? */ + jnz L2 /* no => process next dword from stopset */ + +L1: leal -4(%edx), %eax /* prepare loop */ + + /* We use a neat trick for the following loop. Normally we would + have to test for two termination conditions + 1. a character in the stopset was found + and + 2. the end of the string was found + But as a sign that the chracter is in the stopset we store its + value in the table. But the value of NUL is NUL so the loop + terminates for NUL in every case. */ + +L3: addl $4, %eax /* adjust pointer for full loop round */ + + movb (%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L4 /* no => return */ + + movb 1(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L5 /* no => return */ + + movb 2(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jz L6 /* no => return */ + + movb 3(%eax), %cl /* get byte from string */ + testb %cl, (%esp,%ecx) /* is it contained in skipset? */ + jnz L3 /* yes => start loop again */ + + incl %eax /* adjust pointer */ +L6: incl %eax +L5: incl %eax + +L4: subl %edx, %eax /* we have to return the number of valid + characters, so compute distance to first + non-valid character */ + addl $256, %esp /* remove stopset */ + + ret diff --git a/sysdeps/i386/sub_n.S b/sysdeps/i386/sub_n.S index 64d2c25..e18a708 100644 --- a/sysdeps/i386/sub_n.S +++ b/sysdeps/i386/sub_n.S @@ -1,7 +1,7 @@ /* i80386 __mpn_sub_n -- Add two limb vectors of the same length > 0 and store sum in a third limb vector. -Copyright (C) 1992, 1994 Free Software Foundation, Inc. +Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU MP Library. @@ -37,10 +37,10 @@ C_SYMBOL_NAME(__mpn_sub_n:) pushl %edi pushl %esi - movl 12(%esp),%edi /* res_ptr */ - movl 16(%esp),%esi /* s1_ptr */ - movl 20(%esp),%edx /* s2_ptr */ - movl 24(%esp),%ecx /* size */ + movl 12(%esp),%edi /* res_ptr */ + movl 16(%esp),%esi /* s1_ptr */ + movl 20(%esp),%edx /* s2_ptr */ + movl 24(%esp),%ecx /* size */ movl %ecx,%eax shrl $3,%ecx /* compute count for unrolled loop */ @@ -54,14 +54,18 @@ C_SYMBOL_NAME(__mpn_sub_n:) subl %eax,%edx /* ... enter the loop */ shrl $2,%eax /* restore previous value */ #ifdef PIC - call here -here: leal (Loop - 3 - here)(%eax,%eax,8),%eax - addl %eax,(%esp) - ret +/* Calculate start address in loop for PIC. Due to limitations in some + assemblers, Loop-L0-3 cannot be put into the leal */ + call L0 +L0: leal (%eax,%eax,8),%eax + addl (%esp),%eax + addl $(Loop-L0-3),%eax + addl $4,%esp #else - leal (Loop - 3)(%eax,%eax,8),%eax /* calc start addr in loop */ - jmp *%eax /* jump into loop */ +/* Calculate start address in loop for non-PIC. */ + leal (Loop - 3)(%eax,%eax,8),%eax #endif + jmp *%eax /* jump into loop */ ALIGN (3) Loop: movl (%esi),%eax sbbl (%edx),%eax diff --git a/sysdeps/i960/add_n.s b/sysdeps/i960/add_n.s new file mode 100644 index 0000000..6031f6d --- /dev/null +++ b/sysdeps/i960/add_n.s @@ -0,0 +1,21 @@ +.text + .align 4 + .globl ___mpn_add_n +___mpn_add_n: + mov 0,g6 # clear carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + addc g4,g5,g4 # main add + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, clears C bit + + mov g6,g0 + ret diff --git a/sysdeps/i960/addmul_1.s b/sysdeps/i960/addmul_1.s new file mode 100644 index 0000000..1a3de95 --- /dev/null +++ b/sysdeps/i960/addmul_1.s @@ -0,0 +1,26 @@ +.text + .align 4 + .globl ___mpn_mul_1 +___mpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + ld (g13)[g2*4],g5 + + addc g0,g6,g6 # relies on that C bit is clear + addc 0,g7,g7 + addc g5,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/sysdeps/i960/mul_1.s b/sysdeps/i960/mul_1.s new file mode 100644 index 0000000..e75ea42 --- /dev/null +++ b/sysdeps/i960/mul_1.s @@ -0,0 +1,23 @@ +.text + .align 4 + .globl ___mpn_mul_1 +___mpn_mul_1: + subo g2,0,g2 + shlo 2,g2,g4 + subo g4,g1,g1 + subo g4,g0,g13 + mov 0,g0 + + cmpo 1,0 # clear C bit on AC.cc + +Loop: ld (g1)[g2*4],g5 + emul g3,g5,g6 + + addc g0,g6,g6 # relies on that C bit is clear + st g6,(g13)[g2*4] + addc 0,g7,g0 + + addo g2,1,g2 + cmpobne 0,g2,Loop # when branch is taken, clears C bit + + ret diff --git a/sysdeps/i960/sub_n.s b/sysdeps/i960/sub_n.s new file mode 100644 index 0000000..13ebbfa --- /dev/null +++ b/sysdeps/i960/sub_n.s @@ -0,0 +1,21 @@ +.text + .align 4 + .globl ___mpn_sub_n +___mpn_sub_n: + mov 1,g6 # set carry-save register + cmpo 1,0 # clear cy + +Loop: subo 1,g3,g3 # update loop counter + ld (g1),g5 # load from s1_ptr + addo 4,g1,g1 # s1_ptr++ + ld (g2),g4 # load from s2_ptr + addo 4,g2,g2 # s2_ptr++ + cmpo g6,1 # restore cy from g6, relies on cy being 0 + subc g4,g5,g4 # main subtract + subc 0,0,g6 # save cy in g6 + st g4,(g0) # store result to res_ptr + addo 4,g0,g0 # res_ptr++ + cmpobne 0,g3,Loop # when branch is taken, cy will be 0 + + mov g6,g0 + ret diff --git a/sysdeps/m88k/m88100/add_n.s b/sysdeps/m88k/m88100/add_n.s new file mode 100644 index 0000000..7e4cccc --- /dev/null +++ b/sysdeps/m88k/m88100/add_n.s @@ -0,0 +1,103 @@ +; mc88100 __mpn_add -- Add two limb vectors of the same length > 0 and store +; sum in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___mpn_add_n +___mpn_add_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; add 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + addu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; add 7 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; add 6 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; add 5 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; add 4 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; add 3 + 8r limbs + addu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; add 2 + 8r limbs + addu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; add 1 + 8r limbs + addu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + jmp.n r1 + addu.ci r2,r0,r0 ; return carry-out from most sign. limb diff --git a/sysdeps/m88k/m88100/mul_1.s b/sysdeps/m88k/m88100/mul_1.s new file mode 100644 index 0000000..35c238d --- /dev/null +++ b/sysdeps/m88k/m88100/mul_1.s @@ -0,0 +1,128 @@ +; mc88100 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + +; Common overhead is about 11 cycles/invocation. + +; The speed for S2_LIMB >= 0x10000 is approximately 21 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention.) + +; The speed for S2_LIMB < 0x10000 is approximately 16 cycles/limb. (The +; pipeline stalls 2 cycles due to WB contention and 1 cycle due to latency.) + +; To enhance speed: +; 1. Unroll main loop 4-8 times. +; 2. Schedule code to avoid WB contention. It might be tempting to move the +; ld instruction in the loops down to save 2 cycles (less WB contention), +; but that looses because the ultimate value will be read from outside +; the allocated space. But if we handle the ultimate multiplication in +; the tail, we can do this. +; 3. Make the multiplication with less instructions. I think the code for +; (S2_LIMB >= 0x10000) is not minimal. +; With these techniques the (S2_LIMB >= 0x10000) case would run in 17 or +; less cycles/limb; the (S2_LIMB < 0x10000) case would run in 11 +; cycles/limb. (Assuming infinite unrolling.) + + text + align 16 + global ___mpn_mul_1 +___mpn_mul_1: + + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r6,r2[r4] ; RES_PTR in r6 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + ld r9,r3[r4] + mask r7,r5,0xffff ; r7 = lo(S2_LIMB) + extu r8,r5,16 ; r8 = hi(S2_LIMB) + bcnd.n eq0,r8,Lsmall ; jump if (hi(S2_LIMB) == 0) + subu r6,r6,4 + +; General code for any value of S2_LIMB. + + ; Make a stack frame and save r25 and r26 + subu r31,r31,16 + st.d r25,r31,8 + + ; Enter the loop in the middle + br.n L1 + addu r4,r4,1 + +Loop: + ld r9,r3[r4] + st r26,r6[r4] +; bcnd ne0,r0,0 ; bubble + addu r4,r4,1 +L1: mul r26,r9,r5 ; low word of product mul_1 WB ld + mask r12,r9,0xffff ; r12 = lo(s1_limb) mask_1 + mul r11,r12,r7 ; r11 = prod_0 mul_2 WB mask_1 + mul r10,r12,r8 ; r10 = prod_1a mul_3 + extu r13,r9,16 ; r13 = hi(s1_limb) extu_1 WB mul_1 + mul r12,r13,r7 ; r12 = prod_1b mul_4 WB extu_1 + mul r25,r13,r8 ; r25 = prod_2 mul_5 WB mul_2 + extu r11,r11,16 ; r11 = hi(prod_0) extu_2 WB mul_3 + addu r10,r10,r11 ; addu_1 WB extu_2 +; bcnd ne0,r0,0 ; bubble WB addu_1 + addu.co r10,r10,r12 ; WB mul_4 + mask.u r10,r10,0xffff ; move the 16 most significant bits... + addu.ci r10,r10,r0 ; ...to the low half of the word... + rot r10,r10,16 ; ...and put carry in pos 16. + addu.co r26,r26,r2 ; add old carry limb + bcnd.n ne0,r4,Loop + addu.ci r2,r25,r10 ; compute new carry limb + + st r26,r6[r4] + ld.d r25,r31,8 + jmp.n r1 + addu r31,r31,16 + +; Fast code for S2_LIMB < 0x10000 +Lsmall: + ; Enter the loop in the middle + br.n SL1 + addu r4,r4,1 + +SLoop: + ld r9,r3[r4] ; + st r8,r6[r4] ; + addu r4,r4,1 ; +SL1: mul r8,r9,r5 ; low word of product + mask r12,r9,0xffff ; r12 = lo(s1_limb) + extu r13,r9,16 ; r13 = hi(s1_limb) + mul r11,r12,r7 ; r11 = prod_0 + mul r12,r13,r7 ; r12 = prod_1b + addu.cio r8,r8,r2 ; add old carry limb + extu r10,r11,16 ; r11 = hi(prod_0) + addu r10,r10,r12 ; + bcnd.n ne0,r4,SLoop + extu r2,r10,16 ; r2 = new carry limb + + jmp.n r1 + st r8,r6[r4] diff --git a/sysdeps/m88k/m88100/sub_n.s b/sysdeps/m88k/m88100/sub_n.s new file mode 100644 index 0000000..3963cd5 --- /dev/null +++ b/sysdeps/m88k/m88100/sub_n.s @@ -0,0 +1,104 @@ +; mc88100 __mpn_sub -- Subtract two limb vectors of the same length > 0 and +; store difference in a third limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; s2_ptr r4 +; size r5 + +; This code has been optimized to run one instruction per clock, avoiding +; load stalls and writeback contention. As a result, the instruction +; order is not always natural. + +; The speed is about 4.6 clocks/limb + 18 clocks/limb-vector on an 88100, +; but on the 88110, it seems to run much slower, 6.6 clocks/limb. + + text + align 16 + global ___mpn_sub_n +___mpn_sub_n: + ld r6,r3,0 ; read first limb from s1_ptr + extu r10,r5,3 + ld r7,r4,0 ; read first limb from s2_ptr + + subu.co r5,r0,r5 ; (clear carry as side effect) + mak r5,r5,3<4> + bcnd eq0,r5,Lzero + + or r12,r0,lo16(Lbase) + or.u r12,r12,hi16(Lbase) + addu r12,r12,r5 ; r12 is address for entering in loop + + extu r5,r5,2 ; divide by 4 + subu r2,r2,r5 ; adjust res_ptr + subu r3,r3,r5 ; adjust s1_ptr + subu r4,r4,r5 ; adjust s2_ptr + + or r8,r6,r0 + + jmp.n r12 + or r9,r7,r0 + +Loop: addu r3,r3,32 + st r8,r2,28 + addu r4,r4,32 + ld r6,r3,0 + addu r2,r2,32 + ld r7,r4,0 +Lzero: subu r10,r10,1 ; subtract 0 + 8r limbs (adj loop cnt) +Lbase: ld r8,r3,4 + subu.cio r6,r6,r7 + ld r9,r4,4 + st r6,r2,0 + ld r6,r3,8 ; subtract 7 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,8 + st r8,r2,4 + ld r8,r3,12 ; subtract 6 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,12 + st r6,r2,8 + ld r6,r3,16 ; subtract 5 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,16 + st r8,r2,12 + ld r8,r3,20 ; subtract 4 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,20 + st r6,r2,16 + ld r6,r3,24 ; subtract 3 + 8r limbs + subu.cio r8,r8,r9 + ld r7,r4,24 + st r8,r2,20 + ld r8,r3,28 ; subtract 2 + 8r limbs + subu.cio r6,r6,r7 + ld r9,r4,28 + st r6,r2,24 + bcnd.n ne0,r10,Loop ; subtract 1 + 8r limbs + subu.cio r8,r8,r9 + + st r8,r2,28 ; store most significant limb + + addu.ci r2,r0,r0 ; return carry-out from most sign. limb + jmp.n r1 + xor r2,r2,1 diff --git a/sysdeps/m88k/m88110/mul_1.s b/sysdeps/m88k/m88110/mul_1.s new file mode 100644 index 0000000..08c3ca0 --- /dev/null +++ b/sysdeps/m88k/m88110/mul_1.s @@ -0,0 +1,84 @@ +; mc88110 __mpn_mul_1 -- Multiply a limb vector with a single limb and +; store the product in a second limb vector. + +; Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +; This file is part of the GNU MP Library. + +; The GNU MP Library is free software; you can redistribute it and/or modify +; it under the terms of the GNU Library General Public License as published by +; the Free Software Foundation; either version 2 of the License, or (at your +; option) any later version. + +; The GNU MP Library is distributed in the hope that it will be useful, but +; WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +; License for more details. + +; You should have received a copy of the GNU Library General Public License +; along with the GNU MP Library; see the file COPYING.LIB. If not, write to +; the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +; INPUT PARAMETERS +; res_ptr r2 +; s1_ptr r3 +; size r4 +; s2_limb r5 + + text + align 16 + global ___mpn_mul_1 +___mpn_mul_1: + ; Make S1_PTR and RES_PTR point at the end of their blocks + ; and negate SIZE. + lda r3,r3[r4] + lda r8,r2[r4] ; RES_PTR in r8 since r2 is retval + subu r4,r0,r4 + + addu.co r2,r0,r0 ; r2 = cy = 0 + + ld r6,r3[r4] + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n eq0,r4,Lend + subu r8,r8,8 + +Loop: ld r6,r3[r4] + addu.cio r9,r11,r2 + or r2,r10,r0 ; could be avoided if unrolled + addu r4,r4,1 + mulu.d r10,r6,r5 + bcnd.n ne0,r4,Loop + st r9,r8[r4] + +Lend: addu.cio r9,r11,r2 + st r9,r8,4 + jmp.n r1 + addu.ci r2,r10,r0 + +; This is the Right Way to do this on '110. 4 cycles / 64-bit limb. +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d ,r11,r5 +; ld.d r12, +; mulu.d ,r10,r5 +; addu.cio +; addu.cio +; st.d +; mulu.d +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d +; ld.d r10, +; mulu.d +; addu.cio +; addu.cio +; st.d +; mulu.d diff --git a/sysdeps/mips/add_n.s b/sysdeps/mips/add_n.s new file mode 100644 index 0000000..c829108 --- /dev/null +++ b/sysdeps/mips/add_n.s @@ -0,0 +1,119 @@ + # MIPS2 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + addu $13,$12,$13 + sltu $2,$13,$12 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + addu $11,$10,$11 + sltu $2,$11,$10 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_add_n diff --git a/sysdeps/mips/addmul_1.s b/sysdeps/mips/addmul_1.s new file mode 100644 index 0000000..abc2fb8 --- /dev/null +++ b/sysdeps/mips/addmul_1.s @@ -0,0 +1,96 @@ + # MIPS __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + addu $3,$10,$3 + sltu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_addmul_1 diff --git a/sysdeps/mips/lshift.s b/sysdeps/mips/lshift.s new file mode 100644 index 0000000..ce33e7c --- /dev/null +++ b/sysdeps/mips/lshift.s @@ -0,0 +1,94 @@ + # MIPS2 __mpn_lshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .set noreorder + .set nomacro + + sll $2,$6,2 + addu $5,$5,$2 # make r5 point at end of src + lw $10,-4($5) # load first limb + subu $13,$0,$7 + addu $4,$4,$2 # make r4 point at end of res + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + srl $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,-8($5) + addiu $4,$4,-4 + addiu $5,$5,-4 + addiu $9,$9,-1 + sll $11,$10,$7 + srl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,-8($5) + addiu $4,$4,-16 + addiu $6,$6,-4 + sll $11,$10,$7 + srl $12,$3,$13 + + lw $10,-12($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,12($4) + srl $9,$10,$13 + + lw $3,-16($5) + sll $11,$10,$7 + or $8,$14,$9 + sw $8,8($4) + srl $12,$3,$13 + + lw $10,-20($5) + sll $14,$3,$7 + or $8,$11,$12 + sw $8,4($4) + srl $9,$10,$13 + + addiu $5,$5,-16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,0($4) + +.Lend: sll $8,$10,$7 + j $31 + sw $8,-4($4) + .end __mpn_lshift diff --git a/sysdeps/mips/mips3/add_n.s b/sysdeps/mips/mips3/add_n.s new file mode 100644 index 0000000..b525780 --- /dev/null +++ b/sysdeps/mips/mips3/add_n.s @@ -0,0 +1,119 @@ + # MIPS3 __mpn_add_n -- Add two limb vectors of the same length > 0 and + # store sum in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_add_n + .ent __mpn_add_n +__mpn_add_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + daddu $13,$12,$13 + sltu $2,$13,$12 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + daddu $11,$10,$11 + sltu $2,$11,$10 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_add_n diff --git a/sysdeps/mips/mips3/addmul_1.s b/sysdeps/mips/mips3/addmul_1.s new file mode 100644 index 0000000..7af0172 --- /dev/null +++ b/sysdeps/mips/mips3/addmul_1.s @@ -0,0 +1,96 @@ + # MIPS3 __mpn_addmul_1 -- Multiply a limb vector with a single limb and + # add the product to a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_addmul_1 + .ent __mpn_addmul_1 +__mpn_addmul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + daddu $3,$10,$3 + sltu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_addmul_1 diff --git a/sysdeps/mips/mips3/gmp-mparam.h b/sysdeps/mips/mips3/gmp-mparam.h new file mode 100644 index 0000000..a801b35 --- /dev/null +++ b/sysdeps/mips/mips3/gmp-mparam.h @@ -0,0 +1,26 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#define BITS_PER_MP_LIMB 64 +#define BYTES_PER_MP_LIMB 8 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 diff --git a/sysdeps/mips/mips3/lshift.s b/sysdeps/mips/mips3/lshift.s new file mode 100644 index 0000000..c05dcaf --- /dev/null +++ b/sysdeps/mips/mips3/lshift.s @@ -0,0 +1,94 @@ + # MIPS3 __mpn_lshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_lshift + .ent __mpn_lshift +__mpn_lshift: + .set noreorder + .set nomacro + + dsll $2,$6,3 + daddu $5,$5,$2 # make r5 point at end of src + ld $10,-8($5) # load first limb + dsubu $13,$0,$7 + daddu $4,$4,$2 # make r4 point at end of res + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsrl $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,-16($5) + daddiu $4,$4,-8 + daddiu $5,$5,-8 + daddiu $9,$9,-1 + dsll $11,$10,$7 + dsrl $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,0($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,-16($5) + daddiu $4,$4,-32 + daddiu $6,$6,-4 + dsll $11,$10,$7 + dsrl $12,$3,$13 + + ld $10,-24($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,24($4) + dsrl $9,$10,$13 + + ld $3,-32($5) + dsll $11,$10,$7 + or $8,$14,$9 + sd $8,16($4) + dsrl $12,$3,$13 + + ld $10,-40($5) + dsll $14,$3,$7 + or $8,$11,$12 + sd $8,8($4) + dsrl $9,$10,$13 + + daddiu $5,$5,-32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,0($4) + +.Lend: dsll $8,$10,$7 + j $31 + sd $8,-8($4) + .end __mpn_lshift diff --git a/sysdeps/mips/mips3/mul_1.s b/sysdeps/mips/mips3/mul_1.s new file mode 100644 index 0000000..87954e5 --- /dev/null +++ b/sysdeps/mips/mips3/mul_1.s @@ -0,0 +1,84 @@ + # MIPS3 __mpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_mul_1 + .ent __mpn_mul_1 +__mpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + daddiu $5,$5,8 + daddu $10,$10,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sd $10,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + dmultu $8,$7 + sd $10,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + daddu $10,$10,$2 + sltu $2,$10,$2 + sd $10,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_mul_1 diff --git a/sysdeps/mips/mips3/rshift.s b/sysdeps/mips/mips3/rshift.s new file mode 100644 index 0000000..e0e2ca2 --- /dev/null +++ b/sysdeps/mips/mips3/rshift.s @@ -0,0 +1,91 @@ + # MIPS3 __mpn_rshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .set noreorder + .set nomacro + + ld $10,0($5) # load first limb + dsubu $13,$0,$7 + daddiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + dsll $2,$10,$13 # compute function result + + dsubu $6,$6,$9 + +.Loop0: ld $3,8($5) + daddiu $4,$4,8 + daddiu $5,$5,8 + daddiu $9,$9,-1 + dsrl $11,$10,$7 + dsll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sd $8,-8($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: ld $3,8($5) + daddiu $4,$4,32 + daddiu $6,$6,-4 + dsrl $11,$10,$7 + dsll $12,$3,$13 + + ld $10,16($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-32($4) + dsll $9,$10,$13 + + ld $3,24($5) + dsrl $11,$10,$7 + or $8,$14,$9 + sd $8,-24($4) + dsll $12,$3,$13 + + ld $10,32($5) + dsrl $14,$3,$7 + or $8,$11,$12 + sd $8,-16($4) + dsll $9,$10,$13 + + daddiu $5,$5,32 + or $8,$14,$9 + bgtz $6,.Loop + sd $8,-8($4) + +.Lend: dsrl $8,$10,$7 + j $31 + sd $8,0($4) + .end __mpn_rshift diff --git a/sysdeps/mips/mips3/sub_n.s b/sysdeps/mips/mips3/sub_n.s new file mode 100644 index 0000000..9a45ffd --- /dev/null +++ b/sysdeps/mips/mips3/sub_n.s @@ -0,0 +1,119 @@ + # MIPS3 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .set noreorder + .set nomacro + + ld $10,0($5) + ld $11,0($6) + + daddiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + dsubu $7,$7,$9 + +.Loop0: daddiu $9,$9,-1 + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + daddiu $5,$5,8 + daddiu $6,$6,8 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + daddiu $4,$4,8 + +.L0: beq $7,$0,.Lend + nop + +.Loop: daddiu $7,$7,-4 + + ld $12,8($5) + daddu $11,$11,$2 + ld $13,8($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + or $2,$2,$8 + + ld $10,16($5) + daddu $13,$13,$2 + ld $11,16($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,8($4) + or $2,$2,$8 + + ld $12,24($5) + daddu $11,$11,$2 + ld $13,24($6) + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,16($4) + or $2,$2,$8 + + ld $10,32($5) + daddu $13,$13,$2 + ld $11,32($6) + sltu $8,$13,$2 + dsubu $13,$12,$13 + sltu $2,$12,$13 + sd $13,24($4) + or $2,$2,$8 + + daddiu $5,$5,32 + daddiu $6,$6,32 + + bne $7,$0,.Loop + daddiu $4,$4,32 + +.Lend: daddu $11,$11,$2 + sltu $8,$11,$2 + dsubu $11,$10,$11 + sltu $2,$10,$11 + sd $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_sub_n diff --git a/sysdeps/mips/mips3/submul_1.s b/sysdeps/mips/mips3/submul_1.s new file mode 100644 index 0000000..f28c6a5 --- /dev/null +++ b/sysdeps/mips/mips3/submul_1.s @@ -0,0 +1,96 @@ + # MIPS3 __mpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_submul_1 + .ent __mpn_submul_1 +__mpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + ld $8,0($5) + + # warm up phase 1 + daddiu $5,$5,8 + dmultu $8,$7 + + daddiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + daddiu $6,$6,-1 + beq $6,$0,$LC1 + ld $8,0($5) # load new s1 limb as early as possible + +Loop: ld $10,0($4) + mflo $3 + mfhi $9 + daddiu $5,$5,8 + daddu $3,$3,$2 # add old carry limb to low product limb + dmultu $8,$7 + ld $8,0($5) # load new s1 limb as early as possible + daddiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + bne $6,$0,Loop # should be "bnel" + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dmultu $8,$7 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + daddiu $4,$4,8 + daddu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: ld $10,0($4) + mflo $3 + mfhi $9 + daddu $3,$3,$2 + sltu $2,$3,$2 + dsubu $3,$10,$3 + sgtu $10,$3,$10 + daddu $2,$2,$10 + sd $3,0($4) + j $31 + daddu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_submul_1 diff --git a/sysdeps/mips/mul_1.s b/sysdeps/mips/mul_1.s new file mode 100644 index 0000000..01327e2 --- /dev/null +++ b/sysdeps/mips/mul_1.s @@ -0,0 +1,84 @@ + # MIPS __mpn_mul_1 -- Multiply a limb vector with a single limb and + # store the product in a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_mul_1 + .ent __mpn_mul_1 +__mpn_mul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: mflo $10 + mfhi $9 + addiu $5,$5,4 + addu $10,$10,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$10,$2 # carry from previous addition -> $2 + sw $10,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + multu $8,$7 + sw $10,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: mflo $10 + mfhi $9 + addu $10,$10,$2 + sltu $2,$10,$2 + sw $10,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_mul_1 diff --git a/sysdeps/mips/rshift.s b/sysdeps/mips/rshift.s new file mode 100644 index 0000000..6941691 --- /dev/null +++ b/sysdeps/mips/rshift.s @@ -0,0 +1,91 @@ + # MIPS2 __mpn_rshift -- + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # src_ptr $5 + # size $6 + # cnt $7 + + .text + .align 2 + .globl __mpn_rshift + .ent __mpn_rshift +__mpn_rshift: + .set noreorder + .set nomacro + + lw $10,0($5) # load first limb + subu $13,$0,$7 + addiu $6,$6,-1 + and $9,$6,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + sll $2,$10,$13 # compute function result + + subu $6,$6,$9 + +.Loop0: lw $3,4($5) + addiu $4,$4,4 + addiu $5,$5,4 + addiu $9,$9,-1 + srl $11,$10,$7 + sll $12,$3,$13 + move $10,$3 + or $8,$11,$12 + bne $9,$0,.Loop0 + sw $8,-4($4) + +.L0: beq $6,$0,.Lend + nop + +.Loop: lw $3,4($5) + addiu $4,$4,16 + addiu $6,$6,-4 + srl $11,$10,$7 + sll $12,$3,$13 + + lw $10,8($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-16($4) + sll $9,$10,$13 + + lw $3,12($5) + srl $11,$10,$7 + or $8,$14,$9 + sw $8,-12($4) + sll $12,$3,$13 + + lw $10,16($5) + srl $14,$3,$7 + or $8,$11,$12 + sw $8,-8($4) + sll $9,$10,$13 + + addiu $5,$5,16 + or $8,$14,$9 + bgtz $6,.Loop + sw $8,-4($4) + +.Lend: srl $8,$10,$7 + j $31 + sw $8,0($4) + .end __mpn_rshift diff --git a/sysdeps/mips/sub_n.s b/sysdeps/mips/sub_n.s new file mode 100644 index 0000000..63f3b55 --- /dev/null +++ b/sysdeps/mips/sub_n.s @@ -0,0 +1,119 @@ + # MIPS2 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and + # store difference in a third limb vector. + + # Copyright (C) 1995 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # s2_ptr $6 + # size $7 + + .text + .align 2 + .globl __mpn_sub_n + .ent __mpn_sub_n +__mpn_sub_n: + .set noreorder + .set nomacro + + lw $10,0($5) + lw $11,0($6) + + addiu $7,$7,-1 + and $9,$7,4-1 # number of limbs in first loop + beq $9,$0,.L0 # if multiple of 4 limbs, skip first loop + move $2,$0 + + subu $7,$7,$9 + +.Loop0: addiu $9,$9,-1 + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + addiu $5,$5,4 + addiu $6,$6,4 + move $10,$12 + move $11,$13 + bne $9,$0,.Loop0 + addiu $4,$4,4 + +.L0: beq $7,$0,.Lend + nop + +.Loop: addiu $7,$7,-4 + + lw $12,4($5) + addu $11,$11,$2 + lw $13,4($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + or $2,$2,$8 + + lw $10,8($5) + addu $13,$13,$2 + lw $11,8($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,4($4) + or $2,$2,$8 + + lw $12,12($5) + addu $11,$11,$2 + lw $13,12($6) + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,8($4) + or $2,$2,$8 + + lw $10,16($5) + addu $13,$13,$2 + lw $11,16($6) + sltu $8,$13,$2 + subu $13,$12,$13 + sltu $2,$12,$13 + sw $13,12($4) + or $2,$2,$8 + + addiu $5,$5,16 + addiu $6,$6,16 + + bne $7,$0,.Loop + addiu $4,$4,16 + +.Lend: addu $11,$11,$2 + sltu $8,$11,$2 + subu $11,$10,$11 + sltu $2,$10,$11 + sw $11,0($4) + j $31 + or $2,$2,$8 + + .end __mpn_sub_n diff --git a/sysdeps/mips/submul_1.s b/sysdeps/mips/submul_1.s new file mode 100644 index 0000000..616dd1b --- /dev/null +++ b/sysdeps/mips/submul_1.s @@ -0,0 +1,96 @@ + # MIPS __mpn_submul_1 -- Multiply a limb vector with a single limb and + # subtract the product from a second limb vector. + + # Copyright (C) 1992, 1994 Free Software Foundation, Inc. + + # This file is part of the GNU MP Library. + + # The GNU MP Library is free software; you can redistribute it and/or modify + # it under the terms of the GNU Library General Public License as published by + # the Free Software Foundation; either version 2 of the License, or (at your + # option) any later version. + + # The GNU MP Library is distributed in the hope that it will be useful, but + # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY + # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public + # License for more details. + + # You should have received a copy of the GNU Library General Public License + # along with the GNU MP Library; see the file COPYING.LIB. If not, write to + # the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + + # INPUT PARAMETERS + # res_ptr $4 + # s1_ptr $5 + # size $6 + # s2_limb $7 + + .text + .align 4 + .globl __mpn_submul_1 + .ent __mpn_submul_1 +__mpn_submul_1: + .set noreorder + .set nomacro + + # warm up phase 0 + lw $8,0($5) + + # warm up phase 1 + addiu $5,$5,4 + multu $8,$7 + + addiu $6,$6,-1 + beq $6,$0,$LC0 + move $2,$0 # zero cy2 + + addiu $6,$6,-1 + beq $6,$0,$LC1 + lw $8,0($5) # load new s1 limb as early as possible + +Loop: lw $10,0($4) + mflo $3 + mfhi $9 + addiu $5,$5,4 + addu $3,$3,$2 # add old carry limb to low product limb + multu $8,$7 + lw $8,0($5) # load new s1 limb as early as possible + addiu $6,$6,-1 # decrement loop counter + sltu $2,$3,$2 # carry from previous addition -> $2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + bne $6,$0,Loop # should be "bnel" + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 1 +$LC1: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + multu $8,$7 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + addiu $4,$4,4 + addu $2,$9,$2 # add high product limb and carry from addition + + # cool down phase 0 +$LC0: lw $10,0($4) + mflo $3 + mfhi $9 + addu $3,$3,$2 + sltu $2,$3,$2 + subu $3,$10,$3 + sgtu $10,$3,$10 + addu $2,$2,$10 + sw $3,0($4) + j $31 + addu $2,$9,$2 # add high product limb and carry from addition + + .end __mpn_submul_1 diff --git a/sysdeps/rs6000/add_n.s b/sysdeps/rs6000/add_n.s new file mode 100644 index 0000000..34ad9e1 --- /dev/null +++ b/sysdeps/rs6000/add_n.s @@ -0,0 +1,54 @@ +# IBM POWER __mpn_add_n -- Add two limb vectors of equal, non-zero length. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .extern __mpn_add_n[DS] + .extern .__mpn_add_n +.csect [PR] + .align 2 + .globl __mpn_add_n + .globl .__mpn_add_n + .csect __mpn_add_n[DS] +__mpn_add_n: + .long .__mpn_add_n, TOC[tc0], 0 + .csect [PR] +.__mpn_add_n: + mtctr 6 # copy size into CTR + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before used + a 7,0,8 # add least significant limbs, set cy + bdz Lend # If done, skip loop +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) # store previous limb in load latecny slot + ae 7,0,8 # add new limbs with cy, set cy + bdn Loop # decrement CTR and loop back +Lend: st 7,4(3) # store ultimate result limb + lil 3,0 # load cy into ... + aze 3,3 # ... return value register + br diff --git a/sysdeps/rs6000/addmul_1.s b/sysdeps/rs6000/addmul_1.s new file mode 100644 index 0000000..862b613 --- /dev/null +++ b/sysdeps/rs6000/addmul_1.s @@ -0,0 +1,122 @@ +# IBM POWER __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_addmul_1[PR] + .align 2 + .globl __mpn_addmul_1 + .globl .__mpn_addmul_1 + .csect __mpn_addmul_1[DS] +__mpn_addmul_1: + .long .__mpn_addmul_1[PR], TOC[tc0], 0 + .csect .__mpn_addmul_1[PR] +.__mpn_addmul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + cax 9,9,7 + l 7,4(3) + a 8,8,7 # add res_limb + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + l 7,4(3) + aze 9,9 + a 8,8,7 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 8,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 8,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + a 8,8,7 # add res_limb + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/rs6000/lshift.s b/sysdeps/rs6000/lshift.s new file mode 100644 index 0000000..69c7502 --- /dev/null +++ b/sysdeps/rs6000/lshift.s @@ -0,0 +1,58 @@ +# IBM POWER __mpn_lshift -- + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .extern __mpn_lshift[DS] + .extern .__mpn_lshift +.csect [PR] + .align 2 + .globl __mpn_lshift + .globl .__mpn_lshift + .csect __mpn_lshift[DS] +__mpn_lshift: + .long .__mpn_lshift, TOC[tc0], 0 + .csect [PR] +.__mpn_lshift: + sli 0,5,2 + cax 9,3,0 + cax 4,4,0 + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + lu 0,-4(4) # read most significant limb + sre 3,0,8 # compute carry out limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,-4(4) # read 2:nd most significant limb + sreq 7,0,8 # compute most significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,-4(4) # load next lower limb + stu 7,-4(9) # store previous result during read latency + sreq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,-4(9) # store 2:nd least significant limb +Lend2: sle 7,0,6 # compute least significant limb + st 7,-4(9) # store it" \ + br diff --git a/sysdeps/rs6000/mul_1.s b/sysdeps/rs6000/mul_1.s new file mode 100644 index 0000000..f4fa894 --- /dev/null +++ b/sysdeps/rs6000/mul_1.s @@ -0,0 +1,109 @@ +# IBM POWER __mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_mul_1[PR] + .align 2 + .globl __mpn_mul_1 + .globl .__mpn_mul_1 + .csect __mpn_mul_1[DS] +__mpn_mul_1: + .long .__mpn_mul_1[PR], TOC[tc0], 0 + .csect .__mpn_mul_1[PR] +.__mpn_mul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 8 + ai 0,0,0 # reset carry + cax 9,9,7 + blt Lneg +Lpos: bdz Lend +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 8,0,9 + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 8,0,10 + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + cax 10,10,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,9 + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + cax 9,9,0 # adjust high limb for negative s2_limb + mfmq 0 + ae 8,0,10 + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/rs6000/rshift.s b/sysdeps/rs6000/rshift.s new file mode 100644 index 0000000..6056acc --- /dev/null +++ b/sysdeps/rs6000/rshift.s @@ -0,0 +1,56 @@ +# IBM POWER __mpn_rshift -- + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s_ptr r4 +# size r5 +# cnt r6 + + .toc + .extern __mpn_rshift[DS] + .extern .__mpn_rshift +.csect [PR] + .align 2 + .globl __mpn_rshift + .globl .__mpn_rshift + .csect __mpn_rshift[DS] +__mpn_rshift: + .long .__mpn_rshift, TOC[tc0], 0 + .csect [PR] +.__mpn_rshift: + sfi 8,6,32 + mtctr 5 # put limb count in CTR loop register + l 0,0(4) # read least significant limb + ai 9,3,-4 # adjust res_ptr since it's offset in the stu:s + sle 3,0,8 # compute carry limb, and init MQ register + bdz Lend2 # if just one limb, skip loop + lu 0,4(4) # read 2:nd least significant limb + sleq 7,0,8 # compute least significant limb of result + bdz Lend # if just two limb, skip loop +Loop: lu 0,4(4) # load next higher limb + stu 7,4(9) # store previous result during read latency + sleq 7,0,8 # compute result limb + bdn Loop # loop back until CTR is zero +Lend: stu 7,4(9) # store 2:nd most significant limb +Lend2: sre 7,0,6 # compute most significant limb + st 7,4(9) # store it" \ + br diff --git a/sysdeps/rs6000/sub_n.s b/sysdeps/rs6000/sub_n.s new file mode 100644 index 0000000..402fdce --- /dev/null +++ b/sysdeps/rs6000/sub_n.s @@ -0,0 +1,55 @@ +# IBM POWER __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +# store difference in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# s2_ptr r5 +# size r6 + + .toc + .extern __mpn_sub_n[DS] + .extern .__mpn_sub_n +.csect [PR] + .align 2 + .globl __mpn_sub_n + .globl .__mpn_sub_n + .csect __mpn_sub_n[DS] +__mpn_sub_n: + .long .__mpn_sub_n, TOC[tc0], 0 + .csect [PR] +.__mpn_sub_n: + mtctr 6 # copy size into CTR + l 8,0(4) # load least significant s1 limb + l 0,0(5) # load least significant s2 limb + cal 3,-4(3) # offset res_ptr, it's updated before used + sf 7,0,8 # add least significant limbs, set cy + bdz Lend # If done, skip loop +Loop: lu 8,4(4) # load s1 limb and update s1_ptr + lu 0,4(5) # load s2 limb and update s2_ptr + stu 7,4(3) # store previous limb in load latecny slot + sfe 7,0,8 # add new limbs with cy, set cy + bdn Loop # decrement CTR and loop back +Lend: st 7,4(3) # store ultimate result limb + sfe 3,0,0 # load !cy into ... + sfi 3,3,0 # ... return value register + br diff --git a/sysdeps/rs6000/submul_1.s b/sysdeps/rs6000/submul_1.s new file mode 100644 index 0000000..2526332 --- /dev/null +++ b/sysdeps/rs6000/submul_1.s @@ -0,0 +1,127 @@ +# IBM POWER __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr r3 +# s1_ptr r4 +# size r5 +# s2_limb r6 + +# The RS/6000 has no unsigned 32x32->64 bit multiplication instruction. To +# obtain that operation, we have to use the 32x32->64 signed multiplication +# instruction, and add the appropriate compensation to the high limb of the +# result. We add the multiplicand if the multiplier has its most significant +# bit set, and we add the multiplier if the multiplicand has its most +# significant bit set. We need to preserve the carry flag between each +# iteration, so we have to compute the compensation carefully (the natural, +# srai+and doesn't work). Since the POWER architecture has a branch unit +# we can branch in zero cycles, so that's how we perform the additions. + + .toc + .csect .__mpn_submul_1[PR] + .align 2 + .globl __mpn_submul_1 + .globl .__mpn_submul_1 + .csect __mpn_submul_1[DS] +__mpn_submul_1: + .long .__mpn_submul_1[PR], TOC[tc0], 0 + .csect .__mpn_submul_1[PR] +.__mpn_submul_1: + + cal 3,-4(3) + l 0,0(4) + cmpi 0,6,0 + mtctr 5 + mul 9,0,6 + srai 7,0,31 + and 7,7,6 + mfmq 11 + cax 9,9,7 + l 7,4(3) + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + blt Lneg +Lpos: bdz Lend + +Lploop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 0 + ae 11,0,9 # low limb + old_cy_limb + old cy + l 7,4(3) + aze 10,10 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Lp0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Lp0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 0 + ae 11,0,10 + l 7,4(3) + aze 9,9 + sf 8,11,7 + a 11,8,11 # invert cy (r11 is junk) + bge Lp1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Lp1: bdn Lploop + + b Lend + +Lneg: cax 9,9,0 + bdz Lend +Lnloop: lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 10,0,6 + mfmq 7 + ae 11,7,9 + l 7,4(3) + ae 10,10,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln0 + cax 10,10,6 # adjust high limb for negative limb from s1 +Ln0: bdz Lend0 + lu 0,4(4) + stu 8,4(3) + cmpi 0,0,0 + mul 9,0,6 + mfmq 7 + ae 11,7,10 + l 7,4(3) + ae 9,9,0 # propagate cy to new cy_limb + sf 8,11,7 # add res_limb + a 11,8,11 # invert cy (r11 is junk) + bge Ln1 + cax 9,9,6 # adjust high limb for negative limb from s1 +Ln1: bdn Lnloop + b Lend + +Lend0: cal 9,0(10) +Lend: st 8,4(3) + aze 3,9 + br diff --git a/sysdeps/sparc/add_n.S b/sysdeps/sparc/add_n.S index 3be3e39..13704d3 100644 --- a/sysdeps/sparc/add_n.S +++ b/sysdeps/sparc/add_n.S @@ -1,7 +1,7 @@ ! sparc __mpn_add_n -- Add two limb vectors of the same length > 0 and store ! sum in a third limb vector. -! Copyright (C) 1992, 1994 Free Software Foundation, Inc. +! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. @@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_add_n): sub %g0,%o3,%o3 andcc %o3,(16-1),%o3 be Lzero - nop + mov %o4,%g2 ! put first s1_limb in g2 too sll %o3,2,%o3 ! multiply by 4 sub %o0,%o3,%o0 ! adjust res_ptr sub %o1,%o3,%o1 ! adjust s1_ptr sub %o2,%o3,%o2 ! adjust s2_ptr - mov %o4,%g2 - +#if PIC + mov %o7,%g4 ! Save return address register + call 1f + add %o7,Lbase-1f,%g3 +1: mov %g4,%o7 ! Restore return address register +#else sethi %hi(Lbase),%g3 or %g3,%lo(Lbase),%g3 +#endif sll %o3,2,%o3 ! multiply by 4 jmp %g3+%o3 - mov %o5,%g3 + mov %o5,%g3 ! put first s2_limb in g3 too Loop: addxcc %g2,%g3,%o3 add %o1,64,%o1 diff --git a/sysdeps/sparc/sparc8/addmul_1.S b/sysdeps/sparc/sparc8/addmul_1.S index fbaacfd..d1de0c3 100644 --- a/sysdeps/sparc/sparc8/addmul_1.S +++ b/sysdeps/sparc/sparc8/addmul_1.S @@ -37,8 +37,15 @@ C_SYMBOL_NAME(__mpn_addmul_1): sll %o2,4,%g1 and %g1,(4-1)<<4,%g1 +#if PIC + mov %o7,%g4 ! Save return address register + call 1f + add %o7,LL-1f,%g3 +1: mov %g4,%o7 ! Restore return address register +#else sethi %hi(LL),%g3 or %g3,%lo(LL),%g3 +#endif jmp %g3+%g1 nop LL: diff --git a/sysdeps/sparc/sparc8/mul_1.S b/sysdeps/sparc/sparc8/mul_1.S index 9c21768..42717be 100644 --- a/sysdeps/sparc/sparc8/mul_1.S +++ b/sysdeps/sparc/sparc8/mul_1.S @@ -34,8 +34,15 @@ C_SYMBOL_NAME(__mpn_mul_1): sll %o2,4,%g1 and %g1,(4-1)<<4,%g1 +#if PIC + mov %o7,%g4 ! Save return address register + call 1f + add %o7,LL-1f,%g3 +1: mov %g4,%o7 ! Restore return address register +#else sethi %hi(LL),%g3 or %g3,%lo(LL),%g3 +#endif jmp %g3+%g1 ld [%o1+0],%o4 ! 1 LL: diff --git a/sysdeps/sparc/sub_n.S b/sysdeps/sparc/sub_n.S index 7a167b2..6264344 100644 --- a/sysdeps/sparc/sub_n.S +++ b/sysdeps/sparc/sub_n.S @@ -1,7 +1,7 @@ ! sparc __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and ! store difference in a third limb vector. -! Copyright (C) 1992, 1994 Free Software Foundation, Inc. +! Copyright (C) 1992, 1994, 1995 Free Software Foundation, Inc. ! This file is part of the GNU MP Library. @@ -39,20 +39,25 @@ C_SYMBOL_NAME(__mpn_sub_n): sub %g0,%o3,%o3 andcc %o3,(16-1),%o3 be Lzero - nop + mov %o4,%g2 ! put first s1_limb in g2 too sll %o3,2,%o3 ! multiply by 4 sub %o0,%o3,%o0 ! adjust res_ptr sub %o1,%o3,%o1 ! adjust s1_ptr sub %o2,%o3,%o2 ! adjust s2_ptr - mov %o4,%g2 - +#if PIC + mov %o7,%g4 ! Save return address register + call 1f + add %o7,Lbase-1f,%g3 +1: mov %g4,%o7 ! Restore return address register +#else sethi %hi(Lbase),%g3 or %g3,%lo(Lbase),%g3 +#endif sll %o3,2,%o3 ! multiply by 4 jmp %g3+%o3 - mov %o5,%g3 + mov %o5,%g3 ! put first s2_limb in g3 too Loop: subxcc %g2,%g3,%o3 add %o1,64,%o1 diff --git a/sysdeps/unix/sysv/linux/Dist b/sysdeps/unix/sysv/linux/Dist index db5ff95..d6124bd 100644 --- a/sysdeps/unix/sysv/linux/Dist +++ b/sysdeps/unix/sysv/linux/Dist @@ -1,2 +1,3 @@ sys/socketcall.h sys/timex.h +nfs/nfs.h diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile index 6e1dd8c..fcacc53 100644 --- a/sysdeps/unix/sysv/linux/Makefile +++ b/sysdeps/unix/sysv/linux/Makefile @@ -20,7 +20,11 @@ sysdep_routines := $(sysdep_routines) ipc endif ifeq ($(subdir), socket) -headers += sys/socketcall.h +headers += sys/socketcall.h +endif + +ifeq ($(subdir), sunrpc) +headers += nfs/nfs.h endif config-LDFLAGS = -Wl,-dynamic-linker=/lib/ld-gnu.so.1 diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h index 7fe4d41..a40ca86 100644 --- a/sysdeps/unix/sysv/linux/i386/sysdep.h +++ b/sysdeps/unix/sysv/linux/i386/sysdep.h @@ -93,43 +93,61 @@ Cambridge, MA 02139, USA. */ (2 * movl is less expensive than pushl + popl). Second unlike for the other registers we don't save the content of - %ecx and %edx when we have than 1 and 2 registers resp. */ + %ecx and %edx when we have than 1 and 2 registers resp. + + The code below might look a bit long but we have to take care for + the pipelined processors (i586 and up). Here the `pushl' and `popl' + instructions are marked as NP (not pairable) but the exception is + two consecutive of these instruction. This gives no penalty on + i386 and i486 processors though. */ #undef DO_CALL #define DO_CALL(args) \ + PUSHARGS_##args \ DOARGS_##args \ - int $0x80; \ - UNDOARGS_##args + int $0x80 \ + POPARGS_##args +#define PUSHARGS_0 /* No arguments to push. */ #define DOARGS_0 /* No arguments to frob. */ -#define UNDOARGS_0 /* No arguments to unfrob. */ -#define _DOARGS_0(n) /* No arguments to frob. */ -#define _UNDOARGS_0 /* No arguments to unfrob. */ - -#define DOARGS_1 movl %ebx, %edx; movl 4(%esp), %ebx; DOARGS_0 -#define UNDOARGS_1 UNDOARGS_0; movl %edx, %ebx -#define _DOARGS_1(n) pushl %ebx; movl n+4(%esp), %ebx; _DOARGS_0 (n) -#define _UNDOARGS_1 _UNDOARGS_0; popl %ebx - -#define DOARGS_2 movl 8(%esp), %ecx; DOARGS_1 -#define UNDOARGS_2 UNDOARGS_1 +#define POPARGS_0 /* No arguments to pop. */ +#define _PUSHARGS_0 /* No arguments to push. */ +#define _DOARGS_0(n) /* No arguments to frob. */ +#define _POPARGS_0 /* No arguments to pop. */ + +#define PUSHARGS_1 movl %ebx, %edx; PUSHARGS_0 +#define DOARGS_1 _DOARGS_1 (4) +#define POPARGS_1 POPARGS_0; movl %edx, %ebx +#define _PUSHARGS_1 pushl %ebx; _PUSHARGS_0 +#define _DOARGS_1(n) movl n(%esp), %ebx; _DOARGS_0(n-4) +#define _POPARGS_1 _POPARGS_0; popl %ebx + +#define PUSHARGS_2 PUSHARGS_1 +#define DOARGS_2 _DOARGS_2 (8) +#define POPARGS_2 POPARGS_1 +#define _PUSHARGS_2 _PUSHARGS_1 #define _DOARGS_2(n) movl n(%esp), %ecx; _DOARGS_1 (n-4) -#define _UNDOARGS_2 _UNDOARGS_1 +#define _POPARGS_2 _POPARGS_1 -#define DOARGS_3 _DOARGS_3 (12) -#define UNDOARGS_3 _UNDOARGS_3 +#define PUSHARGS_3 _PUSHARGS_2 +#define DOARGS_3 _DOARGS_3 (16) +#define POPARGS_3 _POPARGS_3 +#define _PUSHARGS_3 _PUSHARGS_2 #define _DOARGS_3(n) movl n(%esp), %edx; _DOARGS_2 (n-4) -#define _UNDOARGS_3 _UNDOARGS_2 - -#define DOARGS_4 _DOARGS_4 (16) -#define UNDOARGS_4 _UNDOARGS_4 -#define _DOARGS_4(n) pushl %esi; movl n+4(%esp), %esi; _DOARGS_3 (n) -#define _UNDOARGS_4 _UNDOARGS_3; popl %esi - -#define DOARGS_5 _DOARGS_5 (20) -#define UNDOARGS_5 _UNDOARGS_5 -#define _DOARGS_5(n) pushl %edi; movl n+4(%esp), %edi; _DOARGS_4 (n) -#define _UNDOARGS_5 _UNDOARGS_4; popl %edi - +#define _POPARGS_3 _POPARGS_2 + +#define PUSHARGS_4 _PUSHARGS_4 +#define DOARGS_4 _DOARGS_4 (24) +#define POPARGS_4 _POPARGS_4 +#define _PUSHARGS_4 pushl %esi; _PUSHARGS_3 +#define _DOARGS_4(n) movl n(%esp), %esi; _DOARGS_3 (n-4) +#define _POPARGS_4 _POPARGS_3; popl %esi + +#define PUSHARGS_5 _PUSHARGS_5 +#define DOARGS_5 _DOARGS_5 (32) +#define POPARGS_5 _POPARGS_5 +#define _PUSHARGS_5 pushl %edi; _PUSHARGS_4 +#define _DOARGS_5(n) movl n(%esp), %edi; _DOARGS_4 (n-4) +#define _POPARGS_5 _POPARGS_4; popl %edi #endif /* ASSEMBLER */ diff --git a/sysdeps/unix/sysv/linux/local_lim.h b/sysdeps/unix/sysv/linux/local_lim.h index bfc65bd..a1c81d8 100644 --- a/sysdeps/unix/sysv/linux/local_lim.h +++ b/sysdeps/unix/sysv/linux/local_lim.h @@ -1,6 +1,6 @@ -/* Minimum guaranteed maximum values for system limits. Hurd version. +/* Minimum guaranteed maximum values for system limits. Linux version. -Copyright (C) 1993, 1994 Free Software Foundation, Inc. +Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -18,14 +18,5 @@ License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/* Linux has a fixed limit of supplementary groups allocated with a - process. This value is determined by the size of the `groups' - member of the `task_struct' structure in . */ - -#define NGROUPS_MAX 32 - - -/* Maximum size of file names. Not all file system types support - this size but it is only a maximum value. */ - -#define NAME_MAX 255 +/* The kernel sources contain a file with all the needed information. */ +#include diff --git a/sysdeps/unix/sysv/linux/nfs/nfs.h b/sysdeps/unix/sysv/linux/nfs/nfs.h new file mode 100644 index 0000000..61e4b65 --- /dev/null +++ b/sysdeps/unix/sysv/linux/nfs/nfs.h @@ -0,0 +1 @@ +#include diff --git a/sysdeps/unix/sysv/linux/sys/param.h b/sysdeps/unix/sysv/linux/sys/param.h index 652605e..a2d4984 100644 --- a/sysdeps/unix/sysv/linux/sys/param.h +++ b/sysdeps/unix/sysv/linux/sys/param.h @@ -1,3 +1,21 @@ +/* Copyright (C) 1995 Free Software Foundation, Inc. +This file is part of the GNU C Library. + +The GNU C Library is free software; you can redistribute it and/or +modify it under the terms of the GNU Library General Public License as +published by the Free Software Foundation; either version 2 of the +License, or (at your option) any later version. + +The GNU C Library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Library General Public License for more details. + +You should have received a copy of the GNU Library General Public +License along with the GNU C Library; see the file COPYING.LIB. If +not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, +Boston, MA 02111-1307, USA. */ + #ifndef _SYS_PARAM_H #define _SYS_PARAM_H @@ -7,26 +25,21 @@ #include -/* Don't change it. H.J. */ -#ifdef OLD_LINUX -#undef MAXHOSTNAMELEN -#define MAXHOSTNAMELEN 8 /* max length of hostname */ -#endif #ifndef howmany -#define howmany(x, y) (((x)+((y)-1))/(y)) +# define howmany(x, y) (((x)+((y)-1))/(y)) #endif #ifndef roundup -#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) +# define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) #endif #define MAXPATHLEN PATH_MAX #define NOFILE OPEN_MAX /* Following the information of some of the kernel people I here assume - * that block size (i.e. the value of stat.st_blocks) for all filesystem - * is 512 bytes. If not tell me or HJ. -- Uli */ + that block size (i.e. the value of stat.st_blocks) for all filesystem + is 512 bytes. If not tell HJ, Roland, or me. -- drepper */ #define DEV_BSIZE 512 #endif diff --git a/sysdeps/vax/add_n.s b/sysdeps/vax/add_n.s new file mode 100644 index 0000000..c89b226 --- /dev/null +++ b/sysdeps/vax/add_n.s @@ -0,0 +1,47 @@ +# VAX __mpn_add_n -- Add two limb vectors of the same length > 0 and store +# sum in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___mpn_add_n +___mpn_add_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + subl2 r4,r4 + +Loop: + movl (r2)+,r4 + adwc (r1)+,r4 + movl r4,(r3)+ + jsobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/sysdeps/vax/addmul_1.s b/sysdeps/vax/addmul_1.s new file mode 100644 index 0000000..8e83204 --- /dev/null +++ b/sysdeps/vax/addmul_1.s @@ -0,0 +1,125 @@ +# VAX __mpn_addmul_1 -- Multiply a limb vector with a limb and add +# the result to a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_addmul_1 +___mpn_addmul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + addl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + addl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + addl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + addl2 r10,(r9)+ + adwc r1,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/vax/gmp-mparam.h b/sysdeps/vax/gmp-mparam.h new file mode 100644 index 0000000..687f12a --- /dev/null +++ b/sysdeps/vax/gmp-mparam.h @@ -0,0 +1,28 @@ +/* gmp-mparam.h -- Compiler/machine parameter header file. + +Copyright (C) 1991, 1993, 1994 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with the GNU MP Library; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + +#define BITS_PER_MP_LIMB 32 +#define BYTES_PER_MP_LIMB 4 +#define BITS_PER_LONGINT 32 +#define BITS_PER_INT 32 +#define BITS_PER_SHORTINT 16 +#define BITS_PER_CHAR 8 + +#define IEEE_DOUBLE_BIG_ENDIAN 0 diff --git a/sysdeps/vax/mul_1.s b/sysdeps/vax/mul_1.s new file mode 100644 index 0000000..3fe375b --- /dev/null +++ b/sysdeps/vax/mul_1.s @@ -0,0 +1,122 @@ +# VAX __mpn_mul_1 -- Multiply a limb vector with a limb and store +# the result in a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_mul_1 +___mpn_mul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + +# One might want to combine the addl2 and the store below, but that +# is actually just slower according to my timing tests. (VAX 3600) + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + movl r2,(r9)+ +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + movl r2,(r9)+ +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r1,r3 + addl2 r11,r2 + adwc r6,r3 + movl r2,(r9)+ + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r1,r11 + addl2 r3,r10 + adwc r6,r11 + movl r10,(r9)+ + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/vax/sub_n.s b/sysdeps/vax/sub_n.s new file mode 100644 index 0000000..300b4de --- /dev/null +++ b/sysdeps/vax/sub_n.s @@ -0,0 +1,47 @@ +# VAX __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store +# difference in a third limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# s2_ptr (sp + 12) +# size (sp + 16) + +.text + .align 1 +.globl ___mpn_sub_n +___mpn_sub_n: + .word 0x0 + movl 16(ap),r0 + movl 12(ap),r1 + movl 8(ap),r2 + movl 4(ap),r3 + subl2 r4,r4 + +Loop: + movl (r2)+,r4 + sbwc (r1)+,r4 + movl r4,(r3)+ + jsobgtr r0,Loop + + adwc r0,r0 + ret diff --git a/sysdeps/vax/submul_1.s b/sysdeps/vax/submul_1.s new file mode 100644 index 0000000..875cbfd --- /dev/null +++ b/sysdeps/vax/submul_1.s @@ -0,0 +1,125 @@ +# VAX __mpn_submul_1 -- Multiply a limb vector with a limb and subtract +# the result from a second limb vector. + +# Copyright (C) 1992, 1994 Free Software Foundation, Inc. + +# This file is part of the GNU MP Library. + +# The GNU MP Library is free software; you can redistribute it and/or modify +# it under the terms of the GNU Library General Public License as published by +# the Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. + +# The GNU MP Library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +# License for more details. + +# You should have received a copy of the GNU Library General Public License +# along with the GNU MP Library; see the file COPYING.LIB. If not, write to +# the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +# INPUT PARAMETERS +# res_ptr (sp + 4) +# s1_ptr (sp + 8) +# size (sp + 12) +# s2_limb (sp + 16) + +.text + .align 1 +.globl ___mpn_submul_1 +___mpn_submul_1: + .word 0xfc0 + movl 12(ap),r4 + movl 8(ap),r8 + movl 4(ap),r9 + movl 16(ap),r6 + jlss s2_big + + clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L1 + clrl r11 + +# Loop for S2_LIMB < 0x80000000 +Loop1: movl (r8)+,r1 + jlss L1n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc $0,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L1: movl (r8)+,r1 + jlss L1n1 +L1p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc $0,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + +L1n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc $0,r3 + movl (r8)+,r1 + jgeq L1p1 +L1n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop1 + movl r11,r0 + ret + + +s2_big: clrl r3 + incl r4 + ashl $-1,r4,r7 + jlbc r4,L2 + clrl r11 + +# Loop for S2_LIMB >= 0x80000000 +Loop2: movl (r8)+,r1 + jlss L2n0 + emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r1,r3 + subl2 r2,(r9)+ + adwc $0,r3 +L2: movl (r8)+,r1 + jlss L2n1 +L2p1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r1,r11 + subl2 r10,(r9)+ + adwc $0,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret + +L2n0: emul r1,r6,$0,r2 + addl2 r11,r2 + adwc r6,r3 + subl2 r2,(r9)+ + adwc r1,r3 + movl (r8)+,r1 + jgeq L2p1 +L2n1: emul r1,r6,$0,r10 + addl2 r3,r10 + adwc r6,r11 + subl2 r10,(r9)+ + adwc r1,r11 + + jsobgtr r7,Loop2 + movl r11,r0 + ret diff --git a/sysdeps/z8000/add_n.s b/sysdeps/z8000/add_n.s new file mode 100644 index 0000000..21efaf5 --- /dev/null +++ b/sysdeps/z8000/add_n.s @@ -0,0 +1,52 @@ +! Z8000 __mpn_add_n -- Add two limb vectors of equal, non-zero length. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___mpn_add_n +___mpn_add_n: + pop r0,@r6 + pop r1,@r5 + add r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + adc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t diff --git a/sysdeps/z8000/mul_1.s b/sysdeps/z8000/mul_1.s new file mode 100644 index 0000000..2075225 --- /dev/null +++ b/sysdeps/z8000/mul_1.s @@ -0,0 +1,67 @@ +! Z8000 __mpn_mul_1 -- Multiply a limb vector with a limb and store +! the result in a second limb vector. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! size r5 +! s2_limb r4 + + unseg + .text + even + global ___mpn_mul_1 +___mpn_mul_1: + sub r2,r2 ! zero carry limb + and r4,r4 + jr mi,Lneg + +Lpos: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + and r1,r1 ! shift msb of loaded limb into cy + jr mi,Lp ! branch if loaded limb's msb is set + add r8,r4 ! hi_limb += sign_comp2 +Lp: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lpos + ret t + +Lneg: pop r1,@r6 + ld r9,r1 + mult rr8,r4 + add r8,r1 ! hi_limb += sign_comp1 + and r1,r1 + jr mi,Ln + add r8,r4 ! hi_limb += sign_comp2 +Ln: add r9,r2 ! lo_limb += cy_limb + xor r2,r2 + adc r2,r8 + ld @r7,r9 + inc r7,#2 + dec r5 + jr ne,Lneg + ret t diff --git a/sysdeps/z8000/sub_n.s b/sysdeps/z8000/sub_n.s new file mode 100644 index 0000000..f75ef22 --- /dev/null +++ b/sysdeps/z8000/sub_n.s @@ -0,0 +1,53 @@ +! Z8000 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and +! store difference in a third limb vector. + +! Copyright (C) 1993, 1994 Free Software Foundation, Inc. + +! This file is part of the GNU MP Library. + +! The GNU MP Library is free software; you can redistribute it and/or modify +! it under the terms of the GNU Library General Public License as published by +! the Free Software Foundation; either version 2 of the License, or (at your +! option) any later version. + +! The GNU MP Library is distributed in the hope that it will be useful, but +! WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +! or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +! License for more details. + +! You should have received a copy of the GNU Library General Public License +! along with the GNU MP Library; see the file COPYING.LIB. If not, write to +! the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + +! INPUT PARAMETERS +! res_ptr r7 +! s1_ptr r6 +! s2_ptr r5 +! size r4 + +! If we are really crazy, we can use push to write a few result words +! backwards, using push just because it is faster than reg+disp. We'd +! then add 2x the number of words written to r7... + + unseg + .text + even + global ___mpn_sub_n +___mpn_sub_n: + pop r0,@r6 + pop r1,@r5 + sub r0,r1 + ld @r7,r0 + dec r4 + jr eq,Lend +Loop: pop r0,@r6 + pop r1,@r5 + sbc r0,r1 + inc r7,#2 + ld @r7,r0 + dec r4 + jr ne,Loop +Lend: ld r2,r4 ! use 0 already in r4 + adc r2,r2 + ret t -- 2.7.4