powerpc: Improve strcmp performance for shorter strings

author Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Tue, 7 Feb 2017 05:10:26 +0000 (10:40 +0530)
committer Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Tue, 7 Feb 2017 05:10:26 +0000 (10:40 +0530)
diff --git a/ChangeLog b/ChangeLog

index 6399c1f..769e738 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-02-07  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+       * sysdeps/powerpc/powerpc64/power8/strcmp.S: Adjust address for
+       unaligned load for shorter strings.
+       * sysdeps/powerpc/powerpc64/power9/strcmp.S: Likewise.
+
  2017-02-06  Joseph Myers  <joseph@codesourcery.com>
  
         * math/libm-test-driver.c (flag_test_errno): New variable.
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S

index c34ff4a..d46bff8 100644 (file)
--- a/sysdeps/powerpc/powerpc64/power8/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -30,21 +30,21 @@
  EALIGN (strcmp, 4, 0)
         li      r0,0
  
-       /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+       /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
            the code:
  
             (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
  
-          with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
+          with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
  
         rldicl  r7,r3,0,52
         rldicl  r9,r4,0,52
-       cmpldi  cr7,r7,4096-32
+       cmpldi  cr7,r7,4096-16
         bgt     cr7,L(pagecross_check)
-       cmpldi  cr5,r9,4096-32
+       cmpldi  cr5,r9,4096-16
         bgt     cr5,L(pagecross_check)
  
-       /* For short string up to 32 bytes, load both s1 and s2 using
+       /* For short strings up to 16 bytes, load both s1 and s2 using
            unaligned dwords and compare.  */
         ld      r8,0(r3)
         ld      r10,0(r4)
@@ -60,25 +60,11 @@ EALIGN (strcmp, 4, 0)
         orc.    r9,r12,r11
         bne     cr0,L(different_nocmpb)
  
-       ld      r8,16(r3)
-       ld      r10,16(r4)
-       cmpb    r12,r8,r0
-       cmpb    r11,r8,r10
-       orc.    r9,r12,r11
-       bne     cr0,L(different_nocmpb)
-
-       ld      r8,24(r3)
-       ld      r10,24(r4)
-       cmpb    r12,r8,r0
-       cmpb    r11,r8,r10
-       orc.    r9,r12,r11
-       bne     cr0,L(different_nocmpb)
-
-       addi    r7,r3,32
-       addi    r4,r4,32
+       addi    r7,r3,16
+       addi    r4,r4,16
  
  L(align_8b):
-       /* Now it has checked for first 32 bytes, align source1 to doubleword
+       /* Now it has checked for first 16 bytes, align source1 to doubleword
            and adjust source2 address.  */
         rldicl  r9,r7,0,61      /* source1 alignment to doubleword  */
         subf    r4,r9,r4        /* Adjust source2 address based on source1
diff --git a/sysdeps/powerpc/powerpc64/power9/strcmp.S b/sysdeps/powerpc/powerpc64/power9/strcmp.S

index 3e32396..17ec8c2 100644 (file)
--- a/sysdeps/powerpc/powerpc64/power9/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power9/strcmp.S
@@ -65,21 +65,21 @@
  EALIGN (strcmp, 4, 0)
         li      r0, 0
  
-       /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+       /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
            the code:
  
             (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
  
-          with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
+          with PAGE_SIZE being 4096 and ITER_SIZE being 16.  */
  
         rldicl  r7, r3, 0, 52
         rldicl  r9, r4, 0, 52
-       cmpldi  cr7, r7, 4096-32
+       cmpldi  cr7, r7, 4096-16
         bgt     cr7, L(pagecross_check)
-       cmpldi  cr5, r9, 4096-32
+       cmpldi  cr5, r9, 4096-16
         bgt     cr5, L(pagecross_check)
  
-       /* For short strings up to 32 bytes,  load both s1 and s2 using
+       /* For short strings up to 16 bytes, load both s1 and s2 using
            unaligned dwords and compare.  */
         ld      r8, 0(r3)
         ld      r10, 0(r4)
@@ -95,25 +95,11 @@ EALIGN (strcmp, 4, 0)
         orc.    r9, r12, r11
         bne     cr0, L(different_nocmpb)
  
-       ld      r8, 16(r3)
-       ld      r10, 16(r4)
-       cmpb    r12, r8, r0
-       cmpb    r11, r8, r10
-       orc.    r9, r12, r11
-       bne     cr0, L(different_nocmpb)
-
-       ld      r8, 24(r3)
-       ld      r10, 24(r4)
-       cmpb    r12, r8, r0
-       cmpb    r11, r8, r10
-       orc.    r9, r12, r11
-       bne     cr0, L(different_nocmpb)
-
-       addi    r7, r3, 32
-       addi    r4, r4, 32
+       addi    r7, r3, 16
+       addi    r4, r4, 16
  
  L(align):
-       /* Now it has checked for first 32 bytes.  */
+       /* Now it has checked for first 16 bytes.  */
         vspltisb        v0, 0
         vspltisb        v2, -1
         lvsr    v6, 0, r4   /* Compute mask.  */
author	Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
	Tue, 7 Feb 2017 05:10:26 +0000 (10:40 +0530)
committer	Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
	Tue, 7 Feb 2017 05:10:26 +0000 (10:40 +0530)
ChangeLog		patch | blob | history
sysdeps/powerpc/powerpc64/power8/strcmp.S		patch | blob | history
sysdeps/powerpc/powerpc64/power9/strcmp.S		patch | blob | history