AArch64: Optimize strnlen

author Wilco Dijkstra <wilco.dijkstra@arm.com>

Wed, 11 Jan 2023 13:53:05 +0000 (13:53 +0000)

committer Wilco Dijkstra <wilco.dijkstra@arm.com>

Tue, 17 Jan 2023 15:09:18 +0000 (15:09 +0000)
author Wilco Dijkstra <wilco.dijkstra@arm.com>
Wed, 11 Jan 2023 13:53:05 +0000 (13:53 +0000)
committer Wilco Dijkstra <wilco.dijkstra@arm.com>
Tue, 17 Jan 2023 15:09:18 +0000 (15:09 +0000)
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S

index 35fd148..21112fb 100644 (file)
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -44,19 +44,16 @@
  
  /*
     Core algorithm:
-
-   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
-   per byte. We take 4 bits of every comparison byte with shift right and narrow
-   by 4 instruction. Since the bits in the nibble mask reflect the order in
-   which things occur in the original string, counting trailing zeros identifies
-   exactly which byte matched.  */
+   Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+   four bits per byte using the shrn instruction. Counting trailing zeros then
+   identifies the first zero byte.  */
  
  ENTRY (__strnlen)
         PTR_ARG (0)
         SIZE_ARG (1)
         bic     src, srcin, 15
         cbz     cntin, L(nomatch)
-       ld1     {vdata.16b}, [src], 16
+       ld1     {vdata.16b}, [src]
         cmeq    vhas_chr.16b, vdata.16b, 0
         lsl     shift, srcin, 2
         shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
@@ -71,36 +68,40 @@ L(finish):
         csel    result, cntin, result, ls
         ret
  
+L(nomatch):
+       mov     result, cntin
+       ret
+
  L(start_loop):
         sub     tmp, src, srcin
+       add     tmp, tmp, 17
         subs    cntrem, cntin, tmp
-       b.ls    L(nomatch)
+       b.lo    L(nomatch)
  
         /* Make sure that it won't overread by a 16-byte chunk */
-       add     tmp, cntrem, 15
-       tbnz    tmp, 4, L(loop32_2)
-
+       tbz     cntrem, 4, L(loop32_2)
+       sub     src, src, 16
         .p2align 5
  L(loop32):
-       ldr     qdata, [src], 16
+       ldr     qdata, [src, 32]!
         cmeq    vhas_chr.16b, vdata.16b, 0
         umaxp   vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
         fmov    synd, dend
         cbnz    synd, L(end)
  L(loop32_2):
-       ldr     qdata, [src], 16
+       ldr     qdata, [src, 16]
         subs    cntrem, cntrem, 32
         cmeq    vhas_chr.16b, vdata.16b, 0
-       b.ls    L(end)
+       b.lo    L(end_2)
         umaxp   vend.16b, vhas_chr.16b, vhas_chr.16b            /* 128->64 */
         fmov    synd, dend
         cbz     synd, L(loop32)
-
+L(end_2):
+       add     src, src, 16
  L(end):
         shrn    vend.8b, vhas_chr.8h, 4         /* 128->64 */
-       sub     src, src, 16
-       mov     synd, vend.d[0]
         sub     result, src, srcin
+       fmov    synd, dend
  #ifndef __AARCH64EB__
         rbit    synd, synd
  #endif
@@ -110,10 +111,6 @@ L(end):
         csel    result, cntin, result, ls
         ret
  
-L(nomatch):
-       mov     result, cntin
-       ret
-
  END (__strnlen)
  libc_hidden_def (__strnlen)
  weak_alias (__strnlen, strnlen)
author	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Wed, 11 Jan 2023 13:53:05 +0000 (13:53 +0000)
committer	Wilco Dijkstra <wilco.dijkstra@arm.com>
	Tue, 17 Jan 2023 15:09:18 +0000 (15:09 +0000)