arch/arm64/lib/strncmp.S

   1 /* SPDX-License-Identifier: GPL-2.0-only */
   2 /*
   3  * Copyright (c) 2013-2021, Arm Limited.
   4  *
   5  * Adapted from the original at:
   6  * https://github.com/ARM-software/optimized-routines/blob/e823e3abf5f89ecb/string/aarch64/strncmp.S
   7  */
   8
   9 #include <linux/linkage.h>
  10 #include <asm/assembler.h>
  11
  12 /* Assumptions:
  13  *
  14  * ARMv8-a, AArch64
  15  */
  16
     /* L(name) expands to .Lname.  The .L prefix makes it a non-exported,
        assembler-local label.  */
  17 #define L(label) .L ## label
  18
     /* One byte value repeated across all eight bytes of a dword.  REP8_01 and
        REP8_7f are used by the NUL detection in strncmp.  REP8_80 is not
        referenced by the code shown in this file.  */
  19 #define REP8_01 0x0101010101010101
  20 #define REP8_7f 0x7f7f7f7f7f7f7f7f
  21 #define REP8_80 0x8080808080808080
  22
  23 /* Parameters and result.  */
     /* src1 and result both name x0, so the first string pointer is lost
        once the result is written.  */
  24 #define src1            x0
  25 #define src2            x1
  26 #define limit           x2
  27 #define result          x0
  28
  29 /* Internal variables.  */
     /* data1, data2: bytes or dwords loaded from src1 and src2
                      (data1w and data2w are their 32-bit views).
        has_nul:      non-zero when data1 contains a NUL byte.
        diff:         data1 XOR data2.
        syndrome:     diff OR has_nul.
        tmp1-tmp3:    short-lived scratch values.
        zeroones:     holds REP8_01.
        pos:          leading-zero count of syndrome.
        limit_wd:     number of dwords still to compare.
        mask:         selects the bytes beyond the limit in the last dword.
        endloop:      exit condition of the aligned loop.
        count:        byte counter, sharing x14 with mask.  */
  30 #define data1           x3
  31 #define data1w          w3
  32 #define data2           x4
  33 #define data2w          w4
  34 #define has_nul         x5
  35 #define diff            x6
  36 #define syndrome        x7
  37 #define tmp1            x8
  38 #define tmp2            x9
  39 #define tmp3            x10
  40 #define zeroones        x11
  41 #define pos             x12
  42 #define limit_wd        x13
  43 #define mask            x14
  44 #define endloop         x15
  45 #define count           mask
  46
  47 SYM_FUNC_START_WEAK_PI(strncmp)
             /* strncmp: compare at most limit bytes of two NUL-terminated
                strings.
                In:  src1 (x0) and src2 (x1) point at the strings, limit (x2)
                     is the maximum number of bytes to compare.
                Out: result (x0) is zero when no difference is found before a
                     NUL or the limit.  Otherwise it is negative or positive
                     according to the first differing byte pair, with the
                     bytes compared as unsigned values.
                Scratch: x3-x15, named by the defines at the top of the file.
                The stack is not used.
                There are three paths: both pointers 8-byte aligned
                (L(loop_aligned)), both misaligned by the same amount
                (L(mutual_align)), and misaligned by different amounts
                (L(misaligned8)).  */
             /* A limit of zero compares nothing.  */
  48         cbz     limit, L(ret0)
             /* The low 3 bits of src1 XOR src2 are non-zero when the pointers
                sit at different offsets within a dword.  count is the offset
                of src1 within its dword.  */
  49         eor     tmp1, src1, src2
  50         mov     zeroones, #REP8_01
  51         tst     tmp1, #7
  52         and     count, src1, #7
  53         b.ne    L(misaligned8)
  54         cbnz    count, L(mutual_align)
  55         /* Calculate the number of full and partial words -1.  */
  56         sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
  57         lsr     limit_wd, limit_wd, #3  /* Convert to Dwords.  */
  58
  59         /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
  60            (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
  61            can be done in parallel across the entire word.  */
             /* limit_wd goes negative on the last dword, and csinv then makes
                endloop all ones.  The ccmp yields eq only when has_nul and
                endloop are both zero, so the loop repeats only while there is
                no NUL, no difference and dwords remain.  */
  62         .p2align 4
  63 L(loop_aligned):
  64         ldr     data1, [src1], #8
  65         ldr     data2, [src2], #8
  66 L(start_realigned):
  67         subs    limit_wd, limit_wd, #1
  68         sub     tmp1, data1, zeroones
  69         orr     tmp2, data1, #REP8_7f
  70         eor     diff, data1, data2      /* Non-zero if differences found.  */
  71         csinv   endloop, diff, xzr, pl  /* Last Dword or differences.  */
  72         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
  73         ccmp    endloop, #0, #0, eq
  74         b.eq    L(loop_aligned)
  75         /* End of main loop */
  76
  77         /* Not reached the limit, must have found the end or a diff.  */
  78         tbz     limit_wd, #63, L(not_limit)
  79
  80         /* Limit % 8 == 0 => all bytes significant.  */
  81         ands    limit, limit, #7
  82         b.eq    L(not_limit)
  83
  84         lsl     limit, limit, #3        /* Bytes -> bits.  */
             /* mask covers the bytes of the last dword that lie beyond the
                limit.  They are cleared in both data words.  */
  85         mov     mask, #~0
  86 #ifdef __AARCH64EB__
  87         lsr     mask, mask, limit
  88 #else
  89         lsl     mask, mask, limit
  90 #endif
  91         bic     data1, data1, mask
  92         bic     data2, data2, mask
  93
  94         /* Make sure that the NUL byte is marked in the syndrome.  */
  95         orr     has_nul, has_nul, mask
  96
  97 L(not_limit):
             /* Turn the last pair of data words into the return value.  */
  98         orr     syndrome, diff, has_nul
  99
 100 #ifndef __AARCH64EB__
 101         rev     syndrome, syndrome
 102         rev     data1, data1
 103         /* The MS-non-zero bit of the syndrome marks either the first bit
 104            that is different, or the top bit of the first zero byte.
 105            Shifting left now will bring the critical information into the
 106            top bits.  */
 107         clz     pos, syndrome
 108         rev     data2, data2
 109         lsl     data1, data1, pos
 110         lsl     data2, data2, pos
 111         /* But we need to zero-extend (char is unsigned) the value and then
 112            perform a signed 32-bit subtraction.  */
 113         lsr     data1, data1, #56
 114         sub     result, data1, data2, lsr #56
 115         ret
 116 #else
 117         /* For big-endian we cannot use the trick with the syndrome value
 118            as carry-propagation can corrupt the upper bits if the trailing
 119            bytes in the string contain 0x01.  */
 120         /* However, if there is no NUL byte in the dword, we can generate
 121            the result directly.  We can't just subtract the bytes as the
 122            MSB might be significant.  */
 123         cbnz    has_nul, 1f
             /* result is 0 when equal, 1 when data1 is higher and -1 when
                data1 is lower, using an unsigned compare.  */
 124         cmp     data1, data2
 125         cset    result, ne
 126         cneg    result, result, lo
 127         ret
 128 1:
 129         /* Re-compute the NUL-byte detection, using a byte-reversed value.  */
 130         rev     tmp3, data1
 131         sub     tmp1, tmp3, zeroones
 132         orr     tmp2, tmp3, #REP8_7f
 133         bic     has_nul, tmp1, tmp2
 134         rev     has_nul, has_nul
 135         orr     syndrome, diff, has_nul
 136         clz     pos, syndrome
 137         /* The MS-non-zero bit of the syndrome marks either the first bit
 138            that is different, or the top bit of the first zero byte.
 139            Shifting left now will bring the critical information into the
 140            top bits.  */
 141         lsl     data1, data1, pos
 142         lsl     data2, data2, pos
 143         /* But we need to zero-extend (char is unsigned) the value and then
 144            perform a signed 32-bit subtraction.  */
 145         lsr     data1, data1, #56
 146         sub     result, data1, data2, lsr #56
 147         ret
 148 #endif
 149
 150 L(mutual_align):
 151         /* Sources are mutually aligned, but are not currently at an
 152            alignment boundary.  Round down the addresses and then mask off
 153            the bytes that precede the start point.
 154            We also need to adjust the limit calculations, but without
 155            overflowing if the limit is near ULONG_MAX.  */
 156         bic     src1, src1, #7
 157         bic     src2, src2, #7
 158         ldr     data1, [src1], #8
 159         neg     tmp3, count, lsl #3     /* 64 - bits(bytes beyond align). */
 160         ldr     data2, [src2], #8
 161         mov     tmp2, #~0
 162         sub     limit_wd, limit, #1     /* limit != 0, so no underflow.  */
 163 #ifdef __AARCH64EB__
 164         /* Big-endian.  Early bytes are at MSB.  */
 165         lsl     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 166 #else
 167         /* Little-endian.  Early bytes are at LSB.  */
 168         lsr     tmp2, tmp2, tmp3        /* Shift (count & 63).  */
 169 #endif
 170         and     tmp3, limit_wd, #7
 171         lsr     limit_wd, limit_wd, #3
 172         /* Adjust the limit. Only low 3 bits used, so overflow irrelevant.  */
 173         add     limit, limit, count
 174         add     tmp3, tmp3, count
             /* tmp2 has all bits set in the bytes before the start point.
                OR-ing it into both words makes those bytes equal and
                non-zero, so they can produce neither a difference nor a
                NUL.  */
 175         orr     data1, data1, tmp2
 176         orr     data2, data2, tmp2
             /* The skipped leading bytes can push the range into one more
                dword.  */
 177         add     limit_wd, limit_wd, tmp3, lsr #3
 178         b       L(start_realigned)
 179
 180         .p2align 4
 181         /* Don't bother with dwords for fewer than 16 bytes.  */
 182 L(misaligned8):
 183         cmp     limit, #16
 184         b.hs    L(try_misaligned_words)
 185
 186 L(byte_loop):
 187         /* Perhaps we can do better than this.  */
             /* The loop repeats only while bytes remain in limit (hi), the
                byte from src1 is not NUL (cs) and both bytes are equal
                (eq).  */
 188         ldrb    data1w, [src1], #1
 189         ldrb    data2w, [src2], #1
 190         subs    limit, limit, #1
 191         ccmp    data1w, #1, #0, hi      /* NZCV = 0b0000.  */
 192         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 193         b.eq    L(byte_loop)
 194 L(done):
             /* Both registers hold zero-extended bytes loaded by ldrb.  */
 195         sub     result, data1, data2
 196         ret
 197         /* Align the SRC1 to a dword by doing a bytewise compare and then do
 198            the dword loop.  */
 199 L(try_misaligned_words):
 200         lsr     limit_wd, limit, #3
 201         cbz     count, L(do_misaligned)
 202
             /* count becomes the number of bytes up to the next dword boundary
                of src1.  Those bytes are taken out of limit here and compared
                one at a time below.  */
 203         neg     count, count
 204         and     count, count, #7
 205         sub     limit, limit, count
 206         lsr     limit_wd, limit, #3
 207
 208 L(page_end_loop):
             /* Compare count bytes one at a time, leaving through L(done) on a
                NUL in src1 or on a mismatch.  */
 209         ldrb    data1w, [src1], #1
 210         ldrb    data2w, [src2], #1
 211         cmp     data1w, #1
 212         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
 213         b.ne    L(done)
 214         subs    count, count, #1
 215         b.hi    L(page_end_loop)
 216
 217 L(do_misaligned):
 218         /* Prepare ourselves for the next page crossing.  Unlike the aligned
 219            loop, we fetch 1 less dword because we risk crossing bounds on
 220            SRC2.  */
             /* count is set to 8 so that a later branch to L(page_end_loop)
                compares one whole dword bytewise.  */
 221         mov     count, #8
 222         subs    limit_wd, limit_wd, #1
 223         b.lo    L(done_loop)
 224 L(loop_misaligned):
             /* tmp2 is zero when bits 3-11 of src2 are all set, meaning src2
                lies in the last 8 bytes of a 4096-byte aligned block.  An
                8-byte load from there could run into the next block, so that
                dword is compared bytewise instead.  */
 225         and     tmp2, src2, #0xff8
 226         eor     tmp2, tmp2, #0xff8
 227         cbz     tmp2, L(page_end_loop)
 228
 229         ldr     data1, [src1], #8
 230         ldr     data2, [src2], #8
 231         sub     tmp1, data1, zeroones
 232         orr     tmp2, data1, #REP8_7f
 233         eor     diff, data1, data2      /* Non-zero if differences found.  */
 234         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 235         ccmp    diff, #0, #0, eq
 236         b.ne    L(not_limit)
 237         subs    limit_wd, limit_wd, #1
 238         b.pl    L(loop_misaligned)
 239
 240 L(done_loop):
 241         /* Dword count used up: compare the remaining limit % 8 bytes, if any.  */
 242         and     limit, limit, #7
 243         cbz     limit, L(not_limit)
 244         /* Read the last word.  */
             /* Both loads fetch the 8 bytes that end exactly at the limit, so
                they overlap bytes that were compared earlier.
                NOTE(review): no page check on src2 precedes these loads,
                unlike L(loop_misaligned) - confirm that this is safe.  */
 245         sub     src1, src1, 8
 246         sub     src2, src2, 8
 247         ldr     data1, [src1, limit]
 248         ldr     data2, [src2, limit]
 249         sub     tmp1, data1, zeroones
 250         orr     tmp2, data1, #REP8_7f
 251         eor     diff, data1, data2      /* Non-zero if differences found.  */
 252         bics    has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
 253         ccmp    diff, #0, #0, eq
 254         b.ne    L(not_limit)
 255
 256 L(ret0):
             /* No difference found within the limit.  */
 257         mov     result, #0
 258         ret
 259
 260 SYM_FUNC_END_PI(strncmp)
 261 EXPORT_SYMBOL_NOHWKASAN(strncmp)