aarch64: Improve strcmp unaligned performance

author Siddhesh Poyarekar <siddhesh@sourceware.org>

Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)

committer Siddhesh Poyarekar <siddhesh@sourceware.org>

Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)
author Siddhesh Poyarekar <siddhesh@sourceware.org>
Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)
committer Siddhesh Poyarekar <siddhesh@sourceware.org>
Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)
diff --git a/ChangeLog b/ChangeLog

index 22df17b..a5419e1 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-12-13  Siddhesh Poyarekar  <siddhesh@sourceware.org>
+
+       * sysdeps/aarch64/strcmp.S (misaligned8): Compare dword at a
+       time whenever possible.
+
  2017-12-12  Carlos O'Donell <carlos@redhat.com>
  
         * elf/Makefile [$(nss-crypt)$(static-nss-crypt) == yesno]
diff --git a/sysdeps/aarch64/strcmp.S b/sysdeps/aarch64/strcmp.S

index e99d662..c260e1d 100644 (file)
--- a/sysdeps/aarch64/strcmp.S
+++ b/sysdeps/aarch64/strcmp.S
@@ -72,6 +72,7 @@ L(start_realigned):
         cbz     syndrome, L(loop_aligned)
         /* End of performance-critical section  -- one 64B cache line.  */
  
+L(end):
  #ifndef        __AARCH64EB__
         rev     syndrome, syndrome
         rev     data1, data1
@@ -145,12 +146,38 @@ L(mutual_align):
         b       L(start_realigned)
  
  L(misaligned8):
-       /* We can do better than this.  */
+       /* Align SRC1 to 8 bytes and then compare 8 bytes at a time, always
+          checking to make sure that we don't access beyond page boundary in
+          SRC2.  */
+       tst     src1, #7
+       b.eq    L(loop_misaligned)
+L(do_misaligned):
         ldrb    data1w, [src1], #1
         ldrb    data2w, [src2], #1
         cmp     data1w, #1
         ccmp    data1w, data2w, #0, cs  /* NZCV = 0b0000.  */
-       b.eq    L(misaligned8)
+       b.ne    L(done)
+       tst     src1, #7
+       b.ne    L(misaligned8)
+
+L(loop_misaligned):
+       /* Test if we are within the last dword of the end of a 4K page.  If
+          yes then jump back to the misaligned loop to compare a byte at a time.  */
+       and     tmp1, src2, #0xff8
+       eor     tmp1, tmp1, #0xff8
+       cbz     tmp1, L(do_misaligned)
+       ldr     data1, [src1], #8
+       ldr     data2, [src2], #8
+
+       sub     tmp1, data1, zeroones
+       orr     tmp2, data1, #REP8_7f
+       eor     diff, data1, data2      /* Non-zero if differences found.  */
+       bic     has_nul, tmp1, tmp2     /* Non-zero if NUL terminator.  */
+       orr     syndrome, diff, has_nul
+       cbz     syndrome, L(loop_misaligned)
+       b       L(end)
+
+L(done):
         sub     result, data1, data2
         RET
  END(strcmp)
author	Siddhesh Poyarekar <siddhesh@sourceware.org>
	Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)
committer	Siddhesh Poyarekar <siddhesh@sourceware.org>
	Wed, 13 Dec 2017 13:20:27 +0000 (18:50 +0530)
ChangeLog		patch \| blob \| history
sysdeps/aarch64/strcmp.S		patch \| blob \| history