+2013-09-16  Will Newton  <will.newton@linaro.org>
+
+	* sysdeps/arm/armv7/multiarch/memcpy_impl.S: Tighten check
+	on entry to aligned copy loop to improve performance.
+
2013-08-30  Roland McGrath  <roland@hack.frob.com>

	* sysdeps/arm/armv6t2/strlen.S: Use sfi_pld and sfi_breg macros.
ARMv6 (ARMv7-a if using Neon)
ARM state
Unaligned accesses
- LDRD/STRD support unaligned word accesses
*/
cfi_adjust_cfa_offset (FRAME_SIZE)
cfi_rel_offset (tmp2, 0)
cfi_remember_state
- and tmp2, src, #3
- and tmp1, dst, #3
+ and tmp2, src, #7
+ and tmp1, dst, #7
cmp tmp1, tmp2
bne .Lcpy_notaligned
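
In C terms, the tightened condition amounts to the sketch below (the
function and parameter names are illustrative, not glibc identifiers).
The old #3 mask only required SRC and DST to agree in their low two
address bits, so the aligned-copy path could still issue
doubleword-misaligned LDRD/STRD; the #7 mask requires agreement in the
low three bits, which is also why the "LDRD/STRD support unaligned word
accesses" assumption is removed from the header comment above.

    #include <stdint.h>

    /* Take the aligned path only when SRC and DST have the same
       offset within a doubleword; otherwise fall through to the
       unaligned code (.Lcpy_notaligned in the assembly).  */
    static int
    same_mutual_alignment (uintptr_t src, uintptr_t dst)
    {
      return (src & 7) == (dst & 7);   /* and/and/cmp/bne above */
    }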
vmov.f32 s0, s0
#endif
- /* SRC and DST have the same mutual 32-bit alignment, but we may
+ /* SRC and DST have the same mutual 64-bit alignment, but we may
still need to pre-copy some bytes to get to natural alignment.
- We bring DST into full 64-bit alignment. */
+ We bring SRC and DST into full 64-bit alignment. */
lsls tmp2, dst, #29
beq 1f
rsbs tmp2, tmp2, #0
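
The lsls/beq/rsbs sequence above computes how many leading bytes must
be pre-copied.  A rough C equivalent, assuming 32-bit pointers as on
ARMv7 (the function name is illustrative):

    #include <stdint.h>

    /* Shifting DST left by 29 bits keeps only its low three bits; a
       zero result means DST is already 64-bit aligned.  Negating and
       shifting back down gives (-dst) & 7, the number of bytes needed
       to reach the next doubleword boundary.  */
    static unsigned int
    bytes_to_next_doubleword (uintptr_t dst)
    {
      uint32_t tmp2 = (uint32_t) dst << 29;   /* lsls tmp2, dst, #29 */
      if (tmp2 == 0)                          /* beq 1f */
        return 0;
      tmp2 = -tmp2;                           /* rsbs tmp2, tmp2, #0 */
      return tmp2 >> 29;                      /* bytes to pre-copy */
    }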
.Ltail63aligned: /* Count in tmp2. */
/* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
- we know that the src and dest are 32-bit aligned so we can use
+ we know that the src and dest are 64-bit aligned so we can use
LDRD/STRD to improve efficiency. */
/* TMP2 is now negative, but we don't care about that. The bottom
six bits still tell us how many bytes are left to copy. */
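
A rough C rendering of this tail step (illustrative only; the actual
assembly computes a branch into an unrolled run of LDRD/STRD pairs
rather than looping):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* TMP2 has gone negative, but since fewer than 64 bytes remain its
       low six bits still equal the remaining count: 64 is a multiple
       of 2^6, so the subtractions never disturb those bits.  */
    static void
    tail63_aligned (uint64_t *dst, const uint64_t *src, int32_t tmp2)
    {
      size_t left = tmp2 & 0x3f;      /* bottom six bits: bytes left */
      while (left >= 8)               /* one doubleword per LDRD/STRD */
        {
          *dst++ = *src++;
          left -= 8;
        }
      if (left > 0)                   /* 1 to 7 trailing bytes */
        memcpy (dst, src, left);
    }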