Rewritten v9/64-bit sparc strcmp.
authorDavid S. Miller <davem@davemloft.net>
Wed, 24 Aug 2011 08:32:24 +0000 (01:32 -0700)
committerDavid S. Miller <davem@davemloft.net>
Wed, 24 Aug 2011 08:32:24 +0000 (01:32 -0700)
ChangeLog
sysdeps/sparc/sparc64/strcmp.S

index 4fde8c2..ab754e5 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2011-08-24  David S. Miller  <davem@davemloft.net>
+
+       * sysdeps/sparc/sparc64/strcmp.S: Rewrite.
+
 2011-08-24  Andreas Schwab  <schwab@redhat.com>
 
        * elf/Makefile: Add rules to build and run unload8 test.
index fade4c4..263bb40 100644 (file)
@@ -1,9 +1,8 @@
 /* Compare two strings for differences.
    For SPARC v9.
-   Copyright (C) 1997, 1999, 2003 Free Software Foundation, Inc.
+   Copyright (C) 2011 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and
-                  Jakub Jelinek <jj@ultra.linux.cz>.
+   Contributed by David S. Miller <davem@davemloft.net>
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
 
 #include <sysdep.h>
 #include <asm/asi.h>
+
 #ifndef XCC
        .register       %g2, #scratch
        .register       %g3, #scratch
        .register       %g6, #scratch
 #endif
 
-       /* Normally, this uses
-          ((xword - 0x0101010101010101) & 0x8080808080808080) test
-          to find out if any byte in xword could be zero. This is fast, but
-          also gives false alarm for any byte in range 0x81-0xff. It does
-          not matter for correctness, as if this test tells us there could
-          be some zero byte, we check it byte by byte, but if bytes with
-          high bits set are common in the strings, then this will give poor
-          performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
-          will use one tick slower, but more precise test
-          ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080),
-          which does not give any false alarms (but if some bits are set,
-          one cannot assume from it which bytes are zero and which are not).
-          It is yet to be measured, what is the correct default for glibc
-          in these days for an average user.
+#define rSTR1          %o0
+#define rSTR2          %o1
+#define r0101          %o2     /* 0x0101010101010101 */
+#define r8080          %o3     /* 0x8080808080808080 */
+#define rSTRXOR                %o4
+#define rWORD1         %o5
+#define rTMP1          %g1
+#define rTMP2          %g2
+#define rWORD2         %g3
+#define rSLL           %g4
+#define rSRL           %g5
+#define rBARREL                %g6
+
+       /* There are two cases, either the two pointers are aligned
+        * identically or they are not.  If they have the same
+        * alignment we can use the normal full speed loop.  Otherwise
+        * we have to use the barrel-shifter version.
         */
 
        .text
-       .align          32
+       .align  32
 ENTRY(strcmp)
-       sethi           %hi(0x01010101), %g1                    /* IEU0         Group           */
-       andcc           %o0, 7, %g0                             /* IEU1                         */
-       bne,pn          %icc, 7f                                /* CTI                          */
-        or             %g1, %lo(0x01010101), %g1               /* IEU0         Group           */
-
-       andcc           %o1, 7, %g3                             /* IEU1                         */
-       bne,pn          %icc, 9f                                /* CTI                          */
-        sllx           %g1, 32, %g2                            /* IEU0         Group           */
-       ldx             [%o0], %o2                              /* Load                         */
-
-       or              %g1, %g2, %g1                           /* IEU0         Group           */
-1:     ldx             [%o1], %o3                              /* Load                         */
-       sub             %o1, %o0, %o1                           /* IEU1                         */
-       sllx            %g1, 7, %g2                             /* IEU0         Group           */
-
-2:     add             %o0, 8, %o0                             /* IEU1                         */
-       sub             %o2, %g1, %g3                           /* IEU0         Group           */
-       subcc           %o2, %o3, %g0                           /* IEU1                         */
-       bne,pn          %xcc, 13f                               /* CTI                          */
-
-#ifdef EIGHTBIT_NOT_RARE
-        andn           %g3, %o2, %g4                           /* IEU0         Group           */
-       ldxa            [%o0] ASI_PNF, %o2                      /* Load                         */
-       andcc           %g4, %g2, %g0                           /* IEU1         Group           */
-#else
-        ldxa           [%o0] ASI_PNF, %o2                      /* Load         Group           */
-       andcc           %g3, %g2, %g0                           /* IEU1                         */
-#endif
-       be,a,pt         %xcc, 2b                                /* CTI                          */
-        ldxa           [%o1 + %o0] ASI_PNF, %o3                /* Load         Group           */
-
-       addcc           %g3, %g1, %o4                           /* IEU1                         */
-       srlx            %g3, 32, %g3                            /* IEU0                         */
-       andcc           %g3, %g2, %g0                           /* IEU1         Group           */
-       be,pt           %xcc, 3f                                /* CTI                          */
-
-        srlx           %o4, 56, %o5                            /* IEU0                         */
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4f                                /* CTI                          */
-        srlx           %o4, 48, %o5                            /* IEU0                         */
-
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4f                                /* CTI                          */
-        srlx           %o4, 40, %o5                            /* IEU0                         */
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-
-       be,pn           %icc, 4f                                /* CTI                          */
-        srlx           %o4, 32, %o5                            /* IEU0                         */
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4f                                /* CTI                          */
-
-3:      srlx           %o4, 24, %o5                            /* IEU0                         */
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4f                                /* CTI                          */
-        srlx           %o4, 16, %o5                            /* IEU0                         */
-
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4f                                /* CTI                          */
-        srlx           %o4, 8, %o5                             /* IEU0                         */
-       andcc           %o5, 0xff, %g0                          /* IEU1         Group           */
-
-       be,pn           %icc, 4f                                /* CTI                          */
-        andcc          %o4, 0xff, %g0                          /* IEU1         Group           */
-       bne,a,pn        %icc, 2b                                /* CTI                          */
-        ldxa           [%o1 + %o0] ASI_PNF, %o3                /* Load                         */
-
-4:     retl                                                    /* CTI+IEU1     Group           */
-        clr            %o0                                     /* IEU0                         */
-
-       .align          32
-13:    mov             0xff, %g6                               /* IEU0         Group           */
-#ifdef EIGHTBIT_NOT_RARE
-       andcc           %g4, %g2, %g0                           /* IEU1                         */
-#else
-       andcc           %g3, %g2, %g0                           /* IEU1                         */
-#endif
-       be,pt           %xcc, 25f                               /* CTI                          */
-        addcc          %g3, %g1, %o4                           /* IEU1         Group           */
-
-       srlx            %g3, 32, %g3                            /* IEU0                         */
-       andcc           %g3, %g2, %g0                           /* IEU1         Group           */
-       be,pt           %xcc, 23f                               /* CTI                          */
-        sllx           %g6, 56, %o5                            /* IEU0                         */
-
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-       be,pn           %xcc, 24f                               /* CTI                          */
-        sllx           %g6, 48, %o5                            /* IEU0                         */
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-
-       be,pn           %xcc, 24f                               /* CTI                          */
-        sllx           %g6, 40, %o5                            /* IEU0                         */
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-       be,pn           %xcc, 24f                               /* CTI                          */
-
-        sllx           %g6, 32, %o5                            /* IEU0                         */
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-       be,pn           %xcc, 24f                               /* CTI                          */
-23:     sllx           %g6, 24, %o5                            /* IEU0                         */
-
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-       be,pn           %icc, 24f                               /* CTI                          */
-        sllx           %g6, 16, %o5                            /* IEU0                         */
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-
-       be,pn           %icc, 24f                               /* CTI                          */
-        sllx           %g6, 8, %o5                             /* IEU0                         */
-       andcc           %o4, %o5, %g0                           /* IEU1         Group           */
-       be,pn           %icc, 24f                               /* CTI                          */
-
-        mov            %g6, %o5                                /* IEU0                         */
-25:    cmp             %o4, %o3                                /* IEU1         Group           */
-5:     mov             -1, %o0                                 /* IEU0                         */
-       retl                                                    /* CTI+IEU1     Group           */
-
-        movgu          %xcc, 1, %o0                            /* Single       Group           */
-
-       .align          16
-24:    sub             %o5, 1, %g6                             /* IEU0         Group           */
-       clr             %o0                                     /* IEU1                         */
-       or              %o5, %g6, %o5                           /* IEU0         Group           */
-       andn            %o4, %o5, %o4                           /* IEU0         Group           */
-
-       andn            %o3, %o5, %o3                           /* IEU1                         */
-       cmp             %o4, %o3                                /* IEU1         Group           */
-       movgu           %xcc, 1, %o0                            /* Single       Group           */
-       retl                                                    /* CTI+IEU1     Group           */
-
-        movlu          %xcc, -1, %o0                           /* Single       Group           */
-6:     retl                                                    /* CTI+IEU1     Group           */
-        mov            %o4, %o0                                /* IEU0                         */
-
-       .align          16
-7:     ldub            [%o0], %o2                              /* Load                         */
-       add             %o0, 1, %o0                             /* IEU1                         */
-       ldub            [%o1], %o3                              /* Load         Group           */
-       sllx            %g1, 32, %g2                            /* IEU0                         */
-
-8:     add             %o1, 1, %o1                             /* IEU1                         */
-       subcc           %o2, %o3, %o4                           /* IEU1         Group           */
-       bne,pn          %xcc, 6b                                /* CTI                          */
-        lduba          [%o0] ASI_PNF, %o2                      /* Load                         */
-
-       brz,pn          %o3, 4b                                 /* CTI+IEU1     Group           */
-        lduba          [%o1] ASI_PNF, %o3                      /* Load                         */
-       andcc           %o0, 7, %g0                             /* IEU1         Group           */
-       bne,a,pn        %icc, 8b                                /* CTI                          */
-
-        add            %o0, 1, %o0                             /* IEU0                         */
-       or              %g1, %g2, %g1                           /* IEU0         Group           */
-       andcc           %o1, 7, %g3                             /* IEU1                         */
-       be,a,pn         %icc, 1b                                /* CTI                          */
-
-        ldxa           [%o0] ASI_PNF, %o2                      /* Load         Group           */
-9:     sllx            %g3, 3, %g5                             /* IEU0                         */
-       mov             64, %o5                                 /* IEU1                         */
-       sub             %o1, %g3, %o1                           /* IEU0         Group           */
-
-       sub             %o5, %g5, %o5                           /* IEU1                         */
-       ldxa            [%o1] ASI_PNF, %g6                      /* Load         Group           */
-       or              %g1, %g2, %g1                           /* IEU0                         */
-       sub             %o1, %o0, %o1                           /* IEU1                         */
-
-       sllx            %g1, 7, %g2                             /* IEU0         Group           */
-       add             %o1, 8, %o1                             /* IEU1                         */
-                                                               /* %g1 = 0101010101010101
-                                                                * %g2 = 8080808080800880
-                                                                * %g5 = number of bits to shift left
-                                                                * %o5 = number of bits to shift right */
-10:    sllx            %g6, %g5, %o3                           /* IEU0         Group           */
-       ldxa            [%o1 + %o0] ASI_PNF, %g6                /* Load                         */
-
-11:    srlx            %g6, %o5, %o4                           /* IEU0         Group           */
-       ldxa            [%o0] ASI_PNF, %o2                      /* Load                         */
-       or              %o3, %o4, %o3                           /* IEU1                         */
-       add             %o0, 8, %o0                             /* IEU0         Group           */
-
-       subcc           %o2, %o3, %g0                           /* IEU1                         */
-#ifdef EIGHTBIT_NOT_RARE
-       sub             %o2, %g1, %g3                           /* IEU0         Group           */
-       bne,pn          %xcc, 13b                               /* CTI                          */
-        andn           %g3, %o2, %g4                           /* IEU0         Group           */
-
-       andcc           %g4, %g2, %g0                           /* IEU1         Group           */
-       be,pt           %xcc, 10b                               /* CTI                          */
-        srlx           %g4, 32, %g4                            /* IEU0                         */
-       andcc           %g4, %g2, %g0                           /* IEU1         Group           */
-#else
-       bne,pn          %xcc, 13b                               /* CTI                          */
-        sub            %o2, %g1, %g3                           /* IEU0         Group           */
-       andcc           %g3, %g2, %g0                           /* IEU1         Group           */
-
-       be,pt           %xcc, 10b                               /* CTI                          */
-        srlx           %g3, 32, %g3                            /* IEU0                         */
-       andcc           %g3, %g2, %g0                           /* IEU1         Group           */
-#endif
-       be,pt           %xcc, 12f                               /* CTI                          */
-
-        srlx           %o2, 56, %g3                            /* IEU0                         */
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-        srlx           %o2, 48, %g3                            /* IEU0                         */
-
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-        srlx           %o2, 40, %g3                            /* IEU0                         */
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-
-       be,pn           %icc, 4b                                /* CTI                          */
-        srlx           %o2, 32, %g3                            /* IEU0                         */
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-
-12:     srlx           %o2, 24, %g3                            /* IEU0                         */
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-        srlx           %o2, 16, %g3                            /* IEU0                         */
-
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-        srlx           %o2, 8, %g3                             /* IEU0                         */
-       andcc           %g3, 0xff, %g0                          /* IEU1         Group           */
-
-       be,pn           %icc, 4b                                /* CTI                          */
-        andcc          %o2, 0xff, %g0                          /* IEU1         Group           */
-       be,pn           %icc, 4b                                /* CTI                          */
-        sllx           %g6, %g5, %o3                           /* IEU0                         */
-
-       ba,pt           %xcc, 11b                               /* CTI          Group           */
-        ldxa           [%o1 + %o0] ASI_PNF, %g6                /* Load                         */
+       or      rSTR2, rSTR1, rTMP1
+       sethi   %hi(0x80808080), r8080
+
+       andcc   rTMP1, 0x7, %g0
+       bne,pn  %icc, .Lmaybe_barrel_shift
+        or     r8080, %lo(0x80808080), r8080
+       ldx     [rSTR1], rWORD1
+
+       sub     rSTR2, rSTR1, rSTR2
+       sllx    r8080, 32, rTMP1
+
+       ldx     [rSTR1 + rSTR2], rWORD2
+       or      r8080, rTMP1, r8080
+
+       ba,pt   %xcc, .Laligned_loop_entry
+        srlx   r8080, 7, r0101
+
+       .align  32
+.Laligned_loop_entry:
+.Laligned_loop:
+       add     rSTR1, 8, rSTR1
+
+       sub     rWORD1, r0101, rTMP2
+       xorcc   rWORD1, rWORD2, rSTRXOR
+       bne,pn  %xcc, .Lcommon_endstring
+
+        andn   r8080, rWORD1, rTMP1
+
+       ldxa    [rSTR1] ASI_PNF, rWORD1
+       andcc   rTMP1, rTMP2, %g0
+       be,a,pt %xcc, .Laligned_loop
+
+        ldxa   [rSTR1 + rSTR2] ASI_PNF, rWORD2
+
+.Lcommon_equal:
+       retl
+        mov    0, %o0
+
+       /* All loops terminate here once they find an unequal word.
+        * If a zero byte appears in the word before the first unequal
+        * byte, we must report zero.  Otherwise we report '1' or '-1'
+        * depending upon whether the first mis-matching byte is larger
+        * in the first string or the second, respectively.
+        *
+        * First we compute a 64-bit mask value that has "0x01" in
+        * each byte where a zero exists in rWORD1.  rSTRXOR holds the
+        * value (rWORD1 ^ rWORD2).  Therefore, if considered as an
+        * unsigned quantity, our "0x01" mask value is "greater than"
+        * rSTRXOR then a zero terminating byte comes first and
+        * therefore we report '0'.
+        *
+        * The formula for this mask is:
+        *
+        *    mask_tmp1 = ~rWORD1 & 0x8080808080808080;
+        *    mask_tmp2 = ((rWORD1 & 0x7f7f7f7f7f7f7f7f) +
+        *                 0x7f7f7f7f7f7f7f7f);
+        *
+        *    mask = ((mask_tmp1 & ~mask_tmp2) >> 7);
+        */
+.Lcommon_endstring:
+       andn    rWORD1, r8080, rTMP2
+       or      r8080, 1, %o1
+
+       mov     1, %o0
+       sub     rTMP2, %o1, rTMP2
+
+       cmp     rWORD1, rWORD2
+       andn    rTMP1, rTMP2, rTMP1
+
+       movleu  %xcc, -1, %o0
+       srlx    rTMP1, 7, rTMP1
+
+       cmp     rTMP1, rSTRXOR
+       retl
+        movgu  %xcc, 0, %o0
+
+.Lmaybe_barrel_shift:
+       sub     rSTR2, rSTR1, rSTR2
+       sllx    r8080, 32, rTMP1
+
+       or      r8080, rTMP1, r8080
+       and     rSTR1, 0x7, rTMP2
+
+       srlx    r8080, 7, r0101
+       andn    rSTR1, 0x7, rSTR1
+
+       ldxa    [rSTR1] ASI_PNF, rWORD1
+       andcc   rSTR2, 0x7, rSLL
+       sll     rTMP2, 3, rSTRXOR
+
+       bne,pn  %icc, .Lneed_barrel_shift
+        mov    -1, rTMP1
+       ldxa    [rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+       srlx    rTMP1, rSTRXOR, rTMP2
+
+       orn     rWORD1, rTMP2, rWORD1
+       ba,pt   %xcc, .Laligned_loop_entry
+        orn    rBARREL, rTMP2, rWORD2
+
+.Lneed_barrel_shift:
+       sllx    rSLL, 3, rSLL
+       andn    rSTR2, 0x7, rSTR2
+
+       ldxa    [rSTR1 + rSTR2] ASI_PNF, rBARREL
+       mov     64, rTMP2
+       sub     rTMP2, rSLL, rSRL
+
+       srlx    rTMP1, rSTRXOR, rTMP1
+       add     rSTR2, 8, rSTR2
+
+       orn     rWORD1, rTMP1, rWORD1
+       sllx    rBARREL, rSLL, rWORD2
+       ldxa    [rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+       add     rSTR1, 8, rSTR1
+       sub     rWORD1, r0101, rTMP2
+
+       srlx    rBARREL, rSRL, rSTRXOR
+
+       or      rWORD2, rSTRXOR, rWORD2
+
+       orn     rWORD2, rTMP1, rWORD2
+       ba,pt   %xcc, .Lbarrel_shift_loop_entry
+        andn   r8080, rWORD1, rTMP1
+
+.Lbarrel_shift_loop:
+       sllx    rBARREL, rSLL, rWORD2
+       ldxa    [rSTR1 + rSTR2] ASI_PNF, rBARREL
+
+       add     rSTR1, 8, rSTR1
+       sub     rWORD1, r0101, rTMP2
+
+       srlx    rBARREL, rSRL, rSTRXOR
+       andn    r8080, rWORD1, rTMP1
+
+       or      rWORD2, rSTRXOR, rWORD2
+
+.Lbarrel_shift_loop_entry:
+       xorcc   rWORD1, rWORD2, rSTRXOR
+       bne,pn  %xcc, .Lcommon_endstring
+
+        andcc  rTMP1, rTMP2, %g0
+       be,a,pt %xcc, .Lbarrel_shift_loop
+        ldxa   [rSTR1] ASI_PNF, rWORD1
+
+       retl
+        mov    0, %o0
 END(strcmp)
 libc_hidden_builtin_def (strcmp)